Python Code
# Program name: word_vector_methods_visualization.py
# Overview:
# Vectorize the words "Japan, America, Korea, mathematics, AI" with
# BoW, TF-IDF, and SBERT, then compute and visualize the cosine
# similarity and the angle between each pair of vectors.
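# Definitions used throughout (standard formulas, added here for reference):
#   cos(theta) = (a · b) / (||a|| * ||b||)
#   theta      = degrees(arccos(cos(theta)))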
# --- Install required libraries ---
!pip install scikit-learn -q
!pip install -U sentence-transformers -q
# --- Import required libraries ---
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# --- Dummy sentences to simulate word context ---
# BoW and TF-IDF operate on documents, so we prepare one short sentence
# containing each target word.
docs = [
    "Japan is a country in Asia. It has advanced technology and culture.",
    "America is known as the United States. It is a leading country in AI and mathematics.",
    "Korea is a country in East Asia. It is famous for technology and culture.",
    "Mathematics is an important academic field.",
    "AI (artificial intelligence) is the technology of the future."
]
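# Optional illustration (added, not part of the original flow): CountVectorizer's
# default analyzer lowercases text and keeps tokens of two or more word
# characters, so "AI" is indexed as "ai" and punctuation is dropped.
print(CountVectorizer().build_analyzer()(docs[4]))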
# Target word list (document i above corresponds to word i)
words = ["Japan", "America", "Korea", "mathematics", "AI"]
# --- Convert the documents to BoW (Bag-of-Words) vectors ---
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(docs).toarray()
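# Optional sanity check (illustrative addition): each row of X_bow is one
# document's raw word-count vector over the learned vocabulary.
print("BoW matrix shape:", X_bow.shape)  # (5 documents, vocabulary size)
print("Vocabulary size:", len(bow_vectorizer.get_feature_names_out()))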
# --- Convert the documents to TF-IDF vectors ---
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(docs).toarray()
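# Optional illustration (added; assumes both words occur in the five sentences
# above): TF-IDF down-weights terms that appear in many documents, so
# "technology" (in three documents) gets a lower IDF than "mathematics"
# (in two documents).
vocab = tfidf_vectorizer.vocabulary_
for w in ["technology", "mathematics"]:
    print(f"idf({w}) = {tfidf_vectorizer.idf_[vocab[w]]:.3f}")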
# --- Shared helper: PCA reduction to 2D and scatter plot ---
def plot_pca(vectors, title, word_labels):
    # Reduce the vectors to two dimensions with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(vectors)
    # Draw the 2D scatter plot with one labeled point per word
    plt.figure(figsize=(6, 5))
    for i, label in enumerate(word_labels):
        plt.scatter(X_pca[i, 0], X_pca[i, 1])
        plt.text(X_pca[i, 0] + 0.01, X_pca[i, 1] + 0.01, label, fontsize=12)
    plt.title(title)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.grid(True)
    plt.axis('equal')
    plt.show()
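# Optional faithfulness check (illustrative addition): PCA to 2D discards
# variance, so each plot is only an approximation of the true geometry.
pca_check = PCA(n_components=2).fit(X_bow)
print("BoW variance kept in 2D:", pca_check.explained_variance_ratio_.sum())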
# --- Cosine similarity and angle for BoW ---
print("【BoW】Cosine Similarity and Angle")
sim_bow = cosine_similarity(X_bow)
angle_bow = np.degrees(np.arccos(np.clip(sim_bow, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_bow[i, j]:.3f} θ={angle_bow[i, j]:.1f}°")
plot_pca(X_bow, "2D Visualization of BoW", words)
# --- Cosine similarity and angle for TF-IDF ---
print("\n【TF-IDF】Cosine Similarity and Angle")
sim_tfidf = cosine_similarity(X_tfidf)
angle_tfidf = np.degrees(np.arccos(np.clip(sim_tfidf, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_tfidf[i, j]:.3f} θ={angle_tfidf[i, j]:.1f}°")
plot_pca(X_tfidf, "2D Visualization of TF-IDF", words)
# --- Word embeddings with SBERT ---
# The all-MiniLM-L6-v2 model is lightweight and fast; note that it is
# trained mainly on English text (it is not a multilingual model).
model = SentenceTransformer('all-MiniLM-L6-v2')
vectors_sbert = model.encode(words)
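# Note (illustrative addition; normalize_embeddings is a standard encode()
# option in sentence-transformers): with unit-length vectors, cosine
# similarity reduces to a plain dot product.
vectors_unit = model.encode(words, normalize_embeddings=True)
print(np.allclose(vectors_unit @ vectors_unit.T,
                  cosine_similarity(vectors_unit), atol=1e-5))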
# --- Cosine similarity and angle for SBERT ---
print("\n【BERT/SBERT】Cosine Similarity and Angle")
sim_sbert = cosine_similarity(vectors_sbert)
angle_sbert = np.degrees(np.arccos(np.clip(sim_sbert, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_sbert[i, j]:.3f} θ={angle_sbert[i, j]:.1f}°")
plot_pca(vectors_sbert, "2D Visualization of SBERT Embeddings", words)
Results