Python Code
# Program name: word_vector_methods_visualization.py
# Overview:
# Vectorize the words "Japan, America, Korea, mathematics, AI" with
# BoW, TF-IDF, and SBERT, then compute and visualize the cosine
# similarity and the angle between each pair of vectors.
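# Definitions used throughout (standard formulas, added here for reference):
#   cos(theta) = (a · b) / (||a|| * ||b||)
#   theta      = degrees(arccos(cos(theta)))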
# --- Install required libraries ---
!pip install scikit-learn -q
!pip install -U sentence-transformers -q
# --- Import required libraries ---
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# --- Dummy sentences to simulate word context ---
# BoW and TF-IDF operate on documents, so we prepare one short sentence
# containing each target word.
docs = [
    "Japan is a country in Asia. It has advanced technology and culture.",
    "America is known as the United States. It is a leading country in AI and mathematics.",
    "Korea is a country in East Asia. It is famous for technology and culture.",
    "Mathematics is an important academic field.",
    "AI (artificial intelligence) is the technology of the future."
]
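# Optional illustration (added, not part of the original flow): CountVectorizer's
# default analyzer lowercases text and keeps tokens of two or more word
# characters, so "AI" is indexed as "ai" and punctuation is dropped.
print(CountVectorizer().build_analyzer()(docs[4]))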
# Target word list (document i above corresponds to word i)
words = ["Japan", "America", "Korea", "mathematics", "AI"]
# --- Convert the documents to BoW (Bag-of-Words) vectors ---
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(docs).toarray()
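# Optional sanity check (illustrative addition): each row of X_bow is one
# document's raw word-count vector over the learned vocabulary.
print("BoW matrix shape:", X_bow.shape)  # (5 documents, vocabulary size)
print("Vocabulary size:", len(bow_vectorizer.get_feature_names_out()))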
# --- Convert the documents to TF-IDF vectors ---
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(docs).toarray()
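# Optional illustration (added; assumes both words occur in the five sentences
# above): TF-IDF down-weights terms that appear in many documents, so
# "technology" (in three documents) gets a lower IDF than "mathematics"
# (in two documents).
vocab = tfidf_vectorizer.vocabulary_
for w in ["technology", "mathematics"]:
    print(f"idf({w}) = {tfidf_vectorizer.idf_[vocab[w]]:.3f}")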
# --- Shared helper: PCA reduction to 2D and scatter plot ---
def plot_pca(vectors, title, word_labels):
    # Reduce the vectors to two dimensions with PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(vectors)
    # Draw the 2D scatter plot with one labeled point per word
    plt.figure(figsize=(6, 5))
    for i, label in enumerate(word_labels):
        plt.scatter(X_pca[i, 0], X_pca[i, 1])
        plt.text(X_pca[i, 0] + 0.01, X_pca[i, 1] + 0.01, label, fontsize=12)
    plt.title(title)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.grid(True)
    plt.axis('equal')
    plt.show()
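# Optional faithfulness check (illustrative addition): PCA to 2D discards
# variance, so each plot is only an approximation of the true geometry.
pca_check = PCA(n_components=2).fit(X_bow)
print("BoW variance kept in 2D:", pca_check.explained_variance_ratio_.sum())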
# --- Cosine similarity and angle for BoW ---
print("【BoW】Cosine Similarity and Angle")
sim_bow = cosine_similarity(X_bow)
angle_bow = np.degrees(np.arccos(np.clip(sim_bow, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_bow[i, j]:.3f} θ={angle_bow[i, j]:.1f}°")
plot_pca(X_bow, "2D Visualization of BoW", words)
# --- Cosine similarity and angle for TF-IDF ---
print("\n【TF-IDF】Cosine Similarity and Angle")
sim_tfidf = cosine_similarity(X_tfidf)
angle_tfidf = np.degrees(np.arccos(np.clip(sim_tfidf, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_tfidf[i, j]:.3f} θ={angle_tfidf[i, j]:.1f}°")
plot_pca(X_tfidf, "2D Visualization of TF-IDF", words)
# --- Word embeddings with SBERT ---
# The all-MiniLM-L6-v2 model is lightweight and fast; note that it is
# trained mainly on English text (it is not a multilingual model).
model = SentenceTransformer('all-MiniLM-L6-v2')
vectors_sbert = model.encode(words)
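# Note (illustrative addition; normalize_embeddings is a standard encode()
# option in sentence-transformers): with unit-length vectors, cosine
# similarity reduces to a plain dot product.
vectors_unit = model.encode(words, normalize_embeddings=True)
print(np.allclose(vectors_unit @ vectors_unit.T,
                  cosine_similarity(vectors_unit), atol=1e-5))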
# --- Cosine similarity and angle for SBERT ---
print("\n【BERT/SBERT】Cosine Similarity and Angle")
sim_sbert = cosine_similarity(vectors_sbert)
angle_sbert = np.degrees(np.arccos(np.clip(sim_sbert, -1.0, 1.0)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        print(f"{words[i]} vs {words[j]} : cosθ={sim_sbert[i, j]:.3f} θ={angle_sbert[i, j]:.1f}°")
plot_pca(vectors_sbert, "2D Visualization of SBERT Embeddings", words)
Results