# Program Name: pokemon_statistical_analysis.py
# Creation Date: 20250417
# Overview: A program to analyze Pokémon data using histograms, boxplots, correlation, regression, time-series plots, and odds ratio
# Usage: To run the program, use the command `python pokemon_statistical_analysis.py` in the terminal
# --- ライブラリのインストール(Google Colab等で使用時に有効)/ Install required libraries ---
# !pip install pandas matplotlib seaborn scikit-learn statsmodels
# --- ライブラリのインポート / Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import statsmodels.api as sm
# === 数値と設定を一箇所に集約 / Centralized parameter definitions ===
# ポケモンのステータスデータ / Pokémon stats data
pokemon_data = {
'No': [1, 2, 3, 6, 25, 26, 143, 149, 150],
'Name': ['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charizard', 'Pikachu', 'Raichu', 'Snorlax', 'Dragonite', 'Mewtwo'],
'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130]
}
hp_column = 'HP'
attack_column = 'Attack'
defense_column = 'Defense'
speed_column = 'Speed'
stat_columns = [hp_column, attack_column, defense_column, speed_column]
# --- データフレームの作成 / Create DataFrame ---
df = pd.DataFrame(pokemon_data)
# === ヒストグラムと累積分布 Histogram & Cumulative Distribution ===
plt.figure(figsize=(10, 4))
plt.hist(df[hp_column], bins=5, alpha=0.7, edgecolor='black')
plt.title('Histogram of HP')
plt.xlabel('HP')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 4))
plt.hist(df[hp_column], bins=5, cumulative=True, alpha=0.7, edgecolor='black')
plt.title('Cumulative Distribution of HP')
plt.xlabel('HP')
plt.ylabel('Cumulative Frequency')
plt.grid(True)
plt.show()
# === 中心と散らばりの指標 / Central Tendency and Dispersion ===
mean_hp = df[hp_column].mean()
median_hp = df[hp_column].median()
std_hp = df[hp_column].std()
var_hp = df[hp_column].var()
range_hp = df[hp_column].max() - df[hp_column].min()
print(f"Mean HP: {mean_hp:.2f} / 平均")
print(f"Median HP: {median_hp:.2f} / 中央値")
print(f"Std Dev HP: {std_hp:.2f} / 標準偏差")
print(f"Variance HP: {var_hp:.2f} / 分散")
print(f"Range HP: {range_hp} / 範囲")
# === 要約統計量と箱ひげ図 / Summary Stats & Boxplot ===
summary_stats = df[stat_columns].describe()
print("\nSummary Statistics / 要約統計量:\n", summary_stats)
plt.figure(figsize=(10, 5))
sns.boxplot(data=df[stat_columns])
plt.title('Boxplot of Stats')
plt.ylabel('Stat Value')
plt.grid(True)
plt.show()
# === 散布図と相関係数 / Scatter Plot & Correlation ===
plt.figure(figsize=(8, 6))
sns.scatterplot(x=attack_column, y=hp_column, data=df)
plt.title('Scatter Plot: Attack vs HP')
plt.xlabel('Attack')
plt.ylabel('HP')
plt.grid(True)
plt.show()
corr, _ = pearsonr(df[attack_column], df[hp_column])
print(f"Pearson Correlation (Attack vs HP): {corr:.2f} / ピアソン相関係数")
# === 回帰直線と決定係数 / Regression Line & R² ===
X = df[[attack_column]]
y = df[hp_column]
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
plt.figure(figsize=(8, 6))
plt.scatter(X, y, label='Data')
plt.plot(X, y_pred, color='red', label='Regression Line')
plt.title('Linear Regression: Attack vs HP')
plt.xlabel('Attack')
plt.ylabel('HP')
plt.legend()
plt.grid(True)
plt.show()
print(f"R-squared: {r2:.3f} / 決定係数")
# === 時系列データプロット / Time Series Description ===
plt.figure(figsize=(10, 4))
plt.plot(df['No'], df[hp_column], marker='o')
plt.title('Time Series-like Plot of HP by No')
plt.xlabel('Pokémon No')
plt.ylabel('HP')
plt.grid(True)
plt.show()
# === 自己相関(ラグ1) / Autocorrelation (Lag-1) ===
acf_vals = sm.tsa.acf(df[hp_column], nlags=3)
print(f"Autocorrelation (lags 0-3): {acf_vals} / 自己相関")
# === クロス集計とオッズ比 / Crosstab and Odds Ratio ===
df['High_HP'] = df[hp_column] >= df[hp_column].median()
df['High_Attack'] = df[attack_column] >= df[attack_column].median()
crosstab = pd.crosstab(df['High_HP'], df['High_Attack'])
print("\nCrosstab:\n", crosstab)
# オッズ比の計算 / Calculate Odds Ratio
try:
    # セル度数をPythonのintに変換し、0除算を確実に捕捉できるようにする / Cast cell counts to int so a zero denominator raises ZeroDivisionError
    odds_ratio = (int(crosstab.loc[True, True]) * int(crosstab.loc[False, False])) / \
                 (int(crosstab.loc[True, False]) * int(crosstab.loc[False, True]))
    print(f"Odds Ratio: {odds_ratio:.2f} / オッズ比")
except (ZeroDivisionError, KeyError):
    print("Odds Ratio: Undefined (zero or missing cell). / セル度数が0または欠損のため定義できません")
# プログラム名: pokemon_lln_clt_simulation.py
# Program Name: pokemon_lln_clt_simulation.py
# 内容: セクション3の主要トピック(大数の法則と中心極限定理)をポケモン色違いの確率で再現
# Description: Simulate LLN and CLT using the shiny Pokémon encounter rate
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# --- 共通設定 / Common Settings ---
np.random.seed(0)
shiny_prob = 1 / 4096 # 色違いポケモンの理論出現率 / Theoretical shiny Pokémon rate (~0.0244%)
# --- ① 大数の法則(LLN)/ Law of Large Numbers ---
# 色違いポケモンを捕まえ続けると、出現率が理論値に収束する様子を観察
# Observe how the shiny rate converges to the theoretical value as trials increase
trials = 10000 # 捕獲数 / Number of Pokémon caught
shiny_results = np.random.binomial(n=1, p=shiny_prob, size=trials)
shiny_rates = np.cumsum(shiny_results) / np.arange(1, trials + 1)
# --- ② 中心極限定理(CLT)/ Central Limit Theorem ---
# 各町のトレーナーが色違いを30匹捕獲し、その平均を集めたときの分布を確認
# Simulate shiny rate means from multiple towns (samples of size 30)
sample_size = 30
num_samples = 1000
shiny_means = [np.mean(np.random.binomial(1, shiny_prob, sample_size)) for _ in range(num_samples)]
# --- ③ 正規分布との比較 / Normal Distribution Overlay ---
# CLTによりサンプル平均の分布は正規分布に近づくことを確認
# According to CLT, sample means should approach a normal distribution
mean_theory = shiny_prob
std_theory = np.sqrt(shiny_prob * (1 - shiny_prob) / sample_size)
x = np.linspace(0, max(shiny_means) + 0.001, 1000)
y = norm.pdf(x, loc=mean_theory, scale=std_theory)
# --- プロット作成 / Create Plots ---
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
# LLNプロット:色違い出現率の収束 / Convergence of shiny rate
ax[0].plot(shiny_rates, label='Sample Shiny Rate')
ax[0].axhline(shiny_prob, color='red', linestyle='--', label='True Shiny Rate (1/4096)')
ax[0].set_title('LLN: Shiny Pokémon Rate Converges to 1/4096')
ax[0].set_xlabel("Number of Pokémon Caught")
ax[0].set_ylabel("Observed Shiny Rate")
ax[0].legend()
ax[0].grid(True)
# CLTプロット:標本平均分布と正規分布 / Sample means and normal approximation
ax[1].hist(shiny_means, bins=30, density=True, alpha=0.7, color='gold', edgecolor='black', label='Sample Means')
ax[1].plot(x, y, 'r--', label='Normal Approximation')
ax[1].set_title("CLT: Shiny Rate Averages Across Towns (n=30)")
ax[1].set_xlabel("Town Average Shiny Rate")
ax[1].set_ylabel("Density")
ax[1].legend()
ax[1].grid(True)
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2, f
# --- プログラム名 / Program Name ---
# pokemon_confidence_intervals_combined.py
# ポケモンのデータを使って区間推定の例を示す
# Demonstrates confidence intervals using Pokémon data
# ------------------------------
# ① 母平均の区間推定 / Confidence Interval for Population Mean
# 例: ポケモンのHP / Example: HP of Pokémon
# ------------------------------
mu_hp = 60 # 仮定された母平均 / Assumed population mean
sigma_hp = 15 # 母標準偏差(既知)/ Known population std. dev.
n_hp = 36 # サンプル数 / Sample size
xbar_hp = 64 # 標本平均 / Sample mean
z = norm.ppf(0.975) # 95%信頼区間のz値 / z-score for 95% CI
se_hp = sigma_hp / np.sqrt(n_hp) # 標準誤差 / Standard error
ci_hp = (xbar_hp - z * se_hp, xbar_hp + z * se_hp) # 信頼区間 / Confidence interval
# ------------------------------
# ② 母分散の区間推定 / Confidence Interval for Population Variance
# 例: ダメージのばらつき / Example: Damage variance
# ------------------------------
s2_dmg = 225 # 標本分散 / Sample variance
n_dmg = 36 # サンプルサイズ / Sample size
alpha = 0.05 # 有意水準 / Significance level
dof_dmg = n_dmg - 1 # 自由度 / Degrees of freedom
# カイ二乗分布を用いた信頼区間 / CI using chi-square distribution
ci_var = (
(dof_dmg * s2_dmg) / chi2.ppf(1 - alpha / 2, dof_dmg),
(dof_dmg * s2_dmg) / chi2.ppf(alpha / 2, dof_dmg)
)
# ------------------------------
# ③ 母比率の区間推定 / Confidence Interval for Proportion
# 例: 色違いの出現率 / Example: Shiny Pokémon appearance rate
# ------------------------------
x_shiny = 3 # 色違いの数 / Number of shinies
n_shiny = 1000 # 試行回数 / Number of encounters
p_hat = x_shiny / n_shiny # 標本比率 / Sample proportion
se_shiny = np.sqrt(p_hat * (1 - p_hat) / n_shiny) # 標準誤差 / Standard error
ci_prop = (p_hat - z * se_shiny, p_hat + z * se_shiny) # 信頼区間 / Confidence interval
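# --- 補足: Wilsonスコア区間によるクロスチェック / Supplement: cross-check with the Wilson score interval ---
# 成功数が3と少ないため、Wald区間は下限が0を下回ることがあります。
# statsmodelsが利用できる環境を仮定し、より安定なWilson区間も併記するスケッチです。
# With only 3 successes the Wald interval can dip below zero; assuming statsmodels is available,
# this sketch also reports the more stable Wilson score interval.
from statsmodels.stats.proportion import proportion_confint
ci_prop_wilson = proportion_confint(x_shiny, n_shiny, alpha=0.05, method='wilson')
print(f"③' 色違い出現率のWilson区間 / Shiny rate Wilson CI: {ci_prop_wilson}")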
# ------------------------------
# ④ 分散比の区間推定 / Confidence Interval for Variance Ratio
# 例: 炎技と水技の分散比 / Example: Fire vs Water move variance
# ------------------------------
s1, n1 = 10, 16 # 炎タイプの分散とサンプル数 / Fire-type
s2, n2 = 5, 21 # 水タイプの分散とサンプル数 / Water-type
# F分布を用いた信頼区間 / CI using F-distribution
ci_var_ratio = (
(s1 / s2) / f.ppf(1 - alpha / 2, n1 - 1, n2 - 1),
(s1 / s2) / f.ppf(alpha / 2, n1 - 1, n2 - 1)
)
# ------------------------------
# 結果出力 / Print Results
# ------------------------------
print("【ポケモンの区間推定まとめ / Pokémon Confidence Interval Summary】")
print(f"① 平均HPの区間推定 / Mean HP CI: {ci_hp}")
print(f"② ダメージ分散の区間推定 / Damage Variance CI: {ci_var}")
print(f"③ 色違い出現率の区間推定 / Shiny Appearance CI: {ci_prop}")
print(f"④ 技の分散比の区間推定(炎/水)/ Move Variance Ratio CI: {ci_var_ratio}")
# ------------------------------
# 可視化 / Visualization of Confidence Intervals
# ------------------------------
labels = ['HP Mean', 'Damage Variance', 'Shiny Rate', 'Variance Ratio']
lower_bounds = [ci_hp[0], ci_var[0], ci_prop[0], ci_var_ratio[0]]
upper_bounds = [ci_hp[1], ci_var[1], ci_prop[1], ci_var_ratio[1]]
means = [(l + u) / 2 for l, u in zip(lower_bounds, upper_bounds)]
errors = [(u - l) / 2 for l, u in zip(lower_bounds, upper_bounds)]
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, means, yerr=errors, capsize=10, color='skyblue')
ax.set_title("Pokémon Confidence Intervals") # グラフタイトル / Title
ax.set_ylabel("Estimated Value") # y軸ラベル / y-axis label
ax.grid(True)
plt.tight_layout()
plt.show()
# Program Name: pokemon_probability_stats.py
# Creation Date: 20250417
# Overview: A program to analyze Pokémon data using probability, expectation, variance, distributions, and correlation
# Usage: To run the program, use the command `python pokemon_probability_stats.py` in the terminal
# --- 必要なライブラリのインストール(Google Colab等の場合有効化)/ Install required libraries ---
# !pip install pandas matplotlib seaborn scipy
# --- ライブラリのインポート / Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns
# === パラメータの一元管理 / Centralized parameter definitions ===
pokemon_data = {
'Name': ['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charizard', 'Pikachu', 'Raichu', 'Snorlax', 'Dragonite', 'Mewtwo'],
'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90]
}
stat_target = 'Attack' # 解析対象の変数 / Target variable for probability analysis
var_x = 'Attack' # 相関用X軸変数 / Variable for x-axis (correlation)
var_y = 'Defense' # 相関用Y軸変数 / Variable for y-axis (correlation)
# --- データフレーム作成 / Create DataFrame ---
df = pd.DataFrame(pokemon_data)
# === 期待値・分散・標準偏差の計算 / Calculate Expectation, Variance, Std ===
mean_val = df[stat_target].mean() # 平均値 / Mean
var_val = df[stat_target].var() # 分散 / Variance
std_val = df[stat_target].std() # 標準偏差 / Standard Deviation
# --- Zスコア計算 / Calculate Z-score (Standardization) ---
df[f'{stat_target}_z'] = (df[stat_target] - mean_val) / std_val
# --- 正規分布とヒストグラムのプロット / Histogram and Normal PDF ---
x_range = np.linspace(df[stat_target].min(), df[stat_target].max(), 100)
pdf_values = norm.pdf(x_range, loc=mean_val, scale=std_val)
plt.figure(figsize=(8, 4))
plt.hist(df[stat_target], bins=5, density=True, alpha=0.6, edgecolor='black', label=f'{stat_target} Histogram')
plt.plot(x_range, pdf_values, color='red', label='Normal Approximation')
plt.title(f'{stat_target} Distribution of Pokémon')
plt.xlabel(stat_target)
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# --- 相関関係の散布図 / Scatter plot for correlation ---
plt.figure(figsize=(8, 6))
sns.scatterplot(x=var_x, y=var_y, data=df)
plt.title(f'{var_x} vs {var_y}')
plt.xlabel(var_x)
plt.ylabel(var_y)
plt.grid(True)
plt.show()
# --- 共分散と相関係数 / Covariance and Correlation Coefficient ---
covariance = df[[var_x, var_y]].cov().iloc[0, 1]
correlation = df[[var_x, var_y]].corr().iloc[0, 1]
# --- 結果表示 / Display results ---
print(f"Mean of {stat_target} / 平均: {mean_val:.2f}")
print(f"Variance of {stat_target} / 分散: {var_val:.2f}")
print(f"Standard Deviation of {stat_target} / 標準偏差: {std_val:.2f}")
print(f"Covariance between {var_x} and {var_y} / 共分散: {covariance:.2f}")
print(f"Correlation Coefficient / 相関係数: {correlation:.2f}")
print("\nZ-scored DataFrame:")
print(df[[stat_target, f'{stat_target}_z']])
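# --- 補足: 正規近似による確率計算 / Supplement: probability under the normal approximation ---
# 上で当てはめた正規分布 N(mean, std²) を使って、攻撃力が閾値を超える確率を見積もるスケッチです。
# 閾値 100 は例示用の値です。
# A sketch estimating the probability that Attack exceeds a threshold under the fitted normal
# N(mean, std²); the threshold of 100 is an illustrative value.
threshold = 100
p_over = norm.sf(threshold, loc=mean_val, scale=std_val)
print(f"P({stat_target} > {threshold}) under normal approximation: {p_over:.3f}")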
# プログラム名: pokemon_ci_diff_proportion.py
# 内容: 炎タイプと水タイプで「ひるませ技」成功率の差の区間推定
# Description: Confidence interval for the difference in flinch move success rate between Fire-type and Water-type Pokémon
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# --- データ定義 / Define sample data ---
x1, n1 = 16, 80 # 炎タイプ: 成功回数 / 試行数 (Fire-type: successes / trials)
x2, n2 = 9, 90 # 水タイプ: 成功回数 / 試行数 (Water-type: successes / trials)
# --- 標本比率 / Sample proportions ---
p1 = x1 / n1 # 炎タイプの成功率 / Fire success rate
p2 = x2 / n2 # 水タイプの成功率 / Water success rate
diff = p1 - p2 # 成功率の差 / Difference in proportions
# --- 標準誤差とZ値 / Standard error and z-value ---
se = np.sqrt((p1 * (1 - p1)) / n1 + (p2 * (1 - p2)) / n2)
z = norm.ppf(0.975) # 95%信頼区間に対応するz値 / z-score for 95% CI
# --- 区間推定 / Compute confidence interval ---
ci_lower = diff - z * se
ci_upper = diff + z * se
# --- 結果表示 / Print results ---
print("🔥 Fire-Type Success Rate:", round(p1, 3))
print("💧 Water-Type Success Rate:", round(p2, 3))
print("📏 Difference (Fire - Water):", round(diff, 3))
print("🧾 95% Confidence Interval:", f"[{ci_lower:.3f}, {ci_upper:.3f}]")
# --- グラフ表示 / Visualization ---
fig, ax = plt.subplots(figsize=(8, 5))
# バーのラベルと値 / Labels and bar heights
labels = ['Fire', 'Water', 'Difference']
values = [p1, p2, diff]
errors = [z * np.sqrt(p1 * (1 - p1) / n1), z * np.sqrt(p2 * (1 - p2) / n2), z * se]
# 棒グラフ + エラーバー / Bar chart with error bars
bars = ax.bar(labels, values, yerr=errors, capsize=10, color=['orangered', 'dodgerblue', 'gray'])
# 0ラインを描画 / Horizontal line at y=0
ax.axhline(0, color='black', linestyle='--', linewidth=1)
# グラフの装飾 / Chart styling
ax.set_ylabel("Success Rate / Difference")
ax.set_title("Confidence Interval for Flinch Move Success Rate\n(Fire vs Water Type Pokémon)")
ax.grid(True, linestyle=':', alpha=0.7)
plt.tight_layout()
plt.show()
# プログラム名: pokemon_test_diff_proportion.py
# 内容: 母比率の差に関する仮説検定(2標本問題)
# Description: Hypothesis test for difference in proportions (Fire vs Water Pokémon)
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
# --- 標本データ / Sample data ---
x1, n1 = 16, 80 # 炎タイプ: 成功回数 / 試行数 (Fire-type: successes / trials)
x2, n2 = 9, 90 # 水タイプ: 成功回数 / 試行数 (Water-type: successes / trials)
# --- 標本比率 / Sample proportions ---
p1 = x1 / n1 # Fire success rate
p2 = x2 / n2 # Water success rate
# --- プールされた比率 / Pooled proportion ---
# 帰無仮説: p1 = p2 のもとで全体の成功率を計算
# Under H0: p1 = p2, compute pooled success rate
p_pooled = (x1 + x2) / (n1 + n2)
# --- 標準誤差(プール法)/ Standard error under H0 (pooled SE) ---
se_pooled = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
# --- Z値の計算 / Calculate Z-statistic ---
z = (p1 - p2) / se_pooled
# --- 両側検定のP値 / Two-tailed p-value ---
p_value = 2 * (1 - norm.cdf(abs(z)))
# --- 結果表示 / Display results ---
print("🔥 Fire-Type Success Rate:", round(p1, 3))
print("💧 Water-Type Success Rate:", round(p2, 3))
print("📊 Z-score:", round(z, 3))
print("📉 P-value:", round(p_value, 4))
# --- 結論 / Hypothesis decision ---
alpha = 0.05 # 有意水準 / Significance level
if p_value < alpha:
print("✅ 結論: 有意差あり(帰無仮説棄却) / Significant difference (Reject H0)")
else:
print("❎ 結論: 有意差なし(帰無仮説を棄却できない) / No significant difference (Fail to reject H0)")
# --- プロット / Visual representation of the test ---
x = np.linspace(-4, 4, 500)
y = norm.pdf(x)
plt.figure(figsize=(10, 5))
plt.plot(x, y, label="Standard Normal Distribution") # 標準正規分布 / Standard normal curve
plt.axvline(z, color='red', linestyle='--', label=f"Z = {z:.2f}") # Z値の線 / Z-score line
plt.fill_between(x, y, where=(abs(x) > abs(z)), color='orange', alpha=0.4, label="Rejection Region") # 棄却域 / Rejection region
plt.title("Hypothesis Test for Difference in Proportions\n(Fire vs Water Pokémon)")
plt.xlabel("Z-value")
plt.ylabel("Density")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
# --- プログラム名: pokemon_chi_square_tests_combined.py ---
# 内容: 適合度の検定と独立性の検定を統合し、グラフ付きで表示するポケモン統計プログラム / Description: A Pokémon statistics program combining the goodness-of-fit test and the test of independence, with plots
# ------------------------------
# 適合度の検定 / Goodness-of-Fit
# ------------------------------
# 観測データ:通常色と色違い / Observed data: normal-colored vs shiny
observed_gof = np.array([9998, 2])
expected_ratio = np.array([1 - 1/4096, 1/4096])
expected_gof = expected_ratio * observed_gof.sum()
# カイ二乗適合度の検定 / Chi-square goodness-of-fit test
chi2_stat, p_value_gof = stats.chisquare(f_obs=observed_gof, f_exp=expected_gof)
# ------------------------------
# 独立性の検定 / Test of Independence
# ------------------------------
# クロス集計データ:タイプ × 状態異常 / Contingency table: type × status condition
data_indep = pd.DataFrame({
'Burn': [30, 5],
'Paralyze': [10, 25]
}, index=['Fire', 'Water'])
chi2_indep, p_value_indep, dof, expected_indep = stats.chi2_contingency(data_indep)
# ------------------------------
# 結果表示 / Display Results
# ------------------------------
print("【適合度の検定 / Goodness-of-Fit Test】")
print("観測値:", observed_gof)
print("期待値:", expected_gof.round(2))
print(f"Chi2統計量: {chi2_stat:.3f}, P値: {p_value_gof:.5f}")
if p_value_gof < 0.05:
print("✅ 結論: 理論比率と一致しない(有意差あり)")
else:
print("❎ 結論: 理論比率と一致(有意差なし)")
print("\n【独立性の検定 / Test of Independence】")
print("観測データ:\n", data_indep)
print("期待度数:\n", pd.DataFrame(expected_indep, index=data_indep.index, columns=data_indep.columns).round(2))
print(f"Chi2統計量: {chi2_indep:.3f}, P値: {p_value_indep:.4f}")
if p_value_indep < 0.05:
print("✅ 結論: タイプと状態異常に関係あり(独立でない)")
else:
print("❎ 結論: タイプと状態異常は独立(関係なし)")
# ------------------------------
# プロット / Plot
# ------------------------------
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# 適合度の検定:棒グラフ / Goodness-of-fit test: bar chart
labels = ['Normal', 'Shiny']
x = np.arange(len(labels))
width = 0.35
ax[0].bar(x - width/2, observed_gof, width, label='Observed', alpha=0.7)
ax[0].bar(x + width/2, expected_gof, width, label='Expected', alpha=0.7)
ax[0].set_title('Goodness-of-Fit: Shiny Rate')
ax[0].set_xticks(x)
ax[0].set_xticklabels(labels)
ax[0].set_ylabel("Count")
ax[0].legend()
ax[0].grid(True)
# 独立性の検定:クロス集計 / Test of independence: contingency table
data_indep.plot(kind='bar', ax=ax[1])
ax[1].set_title('Independence Test: Type vs Status')
ax[1].set_ylabel("Count")
ax[1].legend(title="Status")
ax[1].grid(True)
plt.tight_layout()
plt.show()
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# --- プログラム名: pokemon_hypothesis_tests_with_plot.py ---
# 内容: ポケモンのデータを使った仮説検定(平均・分散・比率・差)とP値の可視化
# Description: Hypothesis testing (mean, variance, proportion, difference) with Pokémon data
# ------------------------------
# ① 母平均の仮説検定(1標本) / One-sample test for mean
sample_hp = np.array([62, 58, 59, 63, 61, 60, 64, 59, 62, 58])
mu_hp = 60 # 仮定された母平均 / Hypothesized population mean
x_bar = sample_hp.mean()
s = sample_hp.std(ddof=1)
n = len(sample_hp)
t_stat = (x_bar - mu_hp) / (s / np.sqrt(n))
p_val_mean = 2 * (1 - stats.t.cdf(abs(t_stat), df=n - 1))
# ------------------------------
# ② 母分散の仮説検定(1標本) / One-sample test for variance
s2 = sample_hp.var(ddof=1)
chi2_stat = (n - 1) * s2 / 100  # 仮定された母分散 = 100 / Hypothesized population variance = 100
p_val_var = 2 * min(
stats.chi2.cdf(chi2_stat, df=n - 1),
1 - stats.chi2.cdf(chi2_stat, df=n - 1)
)
# ------------------------------
# ③ 母比率の仮説検定(1標本) / One-sample test for proportion
x_shiny = 4
n_shiny = 8000
p0 = 1 / 4096
p_hat = x_shiny / n_shiny
se = np.sqrt(p0 * (1 - p0) / n_shiny)
z_stat = (p_hat - p0) / se
p_val_prop = 2 * (1 - stats.norm.cdf(abs(z_stat)))
# ------------------------------
# ④ 母平均の差の仮説検定(2標本) / Two-sample test for mean difference
atk_pre = np.array([50, 48, 52, 51, 49])
atk_post = np.array([60, 58, 63, 61, 59])
t2_stat, p_val_diff = stats.ttest_ind(atk_pre, atk_post)
# ------------------------------
# ⑤ 母分散比の仮説検定(2標本) / Two-sample test for variance ratio
var1 = np.var([40, 42, 43, 38, 41], ddof=1)
var2 = np.var([28, 30, 29, 31, 27], ddof=1)
f_stat = var1 / var2
df1, df2 = 4, 4
p_val_f = 2 * min(
stats.f.cdf(f_stat, df1, df2),
1 - stats.f.cdf(f_stat, df1, df2)
)
# ------------------------------
# 結果表示 / Print Results
print("【ポケモンの仮説検定まとめ / Pokémon Hypothesis Test Summary】")
print(f"① 平均HP = 60?: t = {t_stat:.3f}, p = {p_val_mean:.4f}")
print(f"② 分散 = 100?: χ² = {chi2_stat:.3f}, p = {p_val_var:.4f}")
print(f"③ 色違い率 = 1/4096?: z = {z_stat:.3f}, p = {p_val_prop:.4f}")
print(f"④ 進化前後の攻撃力差: t = {t2_stat:.3f}, p = {p_val_diff:.4f}")
print(f"⑤ 技ばらつき差(炎/水): F = {f_stat:.3f}, p = {p_val_f:.4f}")
# ------------------------------
# P値の可視化 / P-value Visualization
labels = [
"HP = 60 ?",
"Variance = 100 ?",
"Shiny Rate = 1/4096 ?",
"Attack Before vs After",
"Variance Fire vs Water"
]
p_values = [p_val_mean, p_val_var, p_val_prop, p_val_diff, p_val_f]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Hypothesis Test Results for Pokémon Data")
plt.ylabel("P-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()
# 各バーの上にP値を表示 / Annotate p-values above bars
for bar, p in zip(bars, p_values):
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
ha='center', va='bottom')
plt.tight_layout()
plt.show()
# 前のセルがリセットされたので再定義が必要 / Re-imports and definitions are needed because the previous cell's state was reset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew, kurtosis
import pprint
# --- データの準備 / Prepare simplified Pokémon data ---
data = {
"名前": ["フシギダネ", "フシギソウ", "フシギバナ", "ヒトカゲ", "リザード", "リザードン", "ゼニガメ", "カメール", "カメックス", "ミュウツー", "ミュウ"],
"HP": [45, 60, 80, 39, 58, 78, 44, 59, 79, 106, 100],
"攻撃": [49, 62, 82, 52, 64, 84, 48, 63, 83, 110, 100],
"防御": [49, 63, 83, 43, 58, 78, 65, 80, 100, 90, 100],
"素早": [45, 60, 80, 65, 80, 100, 43, 58, 78, 130, 100],
"特殊": [65, 80, 100, 50, 65, 85, 50, 65, 85, 154, 100],
"合計": [253, 325, 425, 249, 325, 425, 250, 325, 425, 590, 500]
}
df = pd.DataFrame(data)
# --- 統計量の計算 / Compute Descriptive Statistics ---
stats_columns = ["HP", "攻撃", "防御", "素早", "特殊", "合計"]
summary = {
"平均 / Mean": df[stats_columns].mean().to_dict(),
"中央値 / Median": df[stats_columns].median().to_dict(),
"最頻値 / Mode": df[stats_columns].mode().iloc[0].to_dict(),
"標準偏差 / Std Dev": df[stats_columns].std().to_dict(),
"変動係数 / Coef of Var": (df[stats_columns].std() / df[stats_columns].mean()).to_dict(),
"歪度 / Skewness": df[stats_columns].apply(skew).to_dict(),
"尖度 / Kurtosis": df[stats_columns].apply(kurtosis).to_dict()
}
# --- 結果出力 / Display Summary ---
pprint.pprint(summary)
# プログラム名: pokemon_regression_anova_combo.py
# 内容: ポケモンのステータスで回帰分析(単回帰・重回帰)と一元配置分散分析(ANOVA)を行う / Description: Run regression analysis (simple and multiple) and one-way ANOVA on Pokémon stats
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# ------------------------------
# データ作成 / Create Pokémon sample data
data = {
'Name': ['フシギダネ', 'ヒトカゲ', 'ゼニガメ', 'ピカチュウ', 'ライチュウ', 'サンドパン', 'ピクシー', 'ガーディ', 'ウインディ', 'ニョロボン'],
'HP': [45, 39, 44, 35, 60, 75, 95, 55, 90, 90],
'Attack': [49, 52, 48, 55, 90, 100, 70, 70, 110, 95],
'Defense':[49, 43, 65, 40, 55, 110, 73, 45, 80, 95],
'Sp.Atk': [65, 60, 50, 50, 90, 45, 95, 70, 100, 70],
'Sp.Def': [65, 50, 64, 50, 80, 55, 90, 50, 80, 90],
'Speed': [45, 65, 43, 90, 110, 65, 60, 60, 95, 70],
'Type': ['Grass', 'Fire', 'Water', 'Electric', 'Electric', 'Ground', 'Fairy', 'Fire', 'Fire', 'Water']
}
df = pd.DataFrame(data)
# ------------------------------
# 単回帰分析 / Simple Linear Regression: Attack ~ HP
X1 = sm.add_constant(df['HP']) # 定数項追加 / Add intercept
y = df['Attack']
model1 = sm.OLS(y, X1).fit()
# ------------------------------
# 重回帰分析 / Multiple Linear Regression: Attack ~ HP + Sp.Atk + Speed
X2 = df[['HP', 'Sp.Atk', 'Speed']]
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()
# ------------------------------
# 一元配置分散分析 / One-Way ANOVA: Attack ~ Type
anova_model = sm.OLS.from_formula("Attack ~ C(Type)", data=df).fit()
anova_table = sm.stats.anova_lm(anova_model, typ=2)
# ------------------------------
# 結果表示 / Print summaries
print("=== 単回帰分析: Attack ~ HP ===")
print(model1.summary())
print("\n=== 重回帰分析: Attack ~ HP + Sp.Atk + Speed ===")
print(model2.summary())
print("\n=== 一元配置分散分析(Attack ~ Type) ===")
print(anova_table)
# ------------------------------
# 可視化(散布図と回帰直線)/ Visualization
plt.figure(figsize=(8, 5))
sns.regplot(x='HP', y='Attack', data=df, ci=95)
plt.title("Simple Linear Regression: Attack ~ HP")
plt.grid(True)
plt.tight_layout()
plt.show()
# ------------------------------
# 決定係数と調整済み決定係数の比較 / R² and Adjusted R²
r2 = model2.rsquared
r2_adj = model2.rsquared_adj
print(f"\n決定係数 R²: {r2:.4f}")
print(f"自由度調整済み決定係数 R²_adj: {r2_adj:.4f}")
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# --- プログラム名: pokemon_statistics_module_s3to5.py ---
# 内容: セクション3〜5の主要統計処理(確率、推定、仮説検定)とP値プロットの統合 / Description: Combines the main statistical procedures of Sections 3-5 (probability, estimation, hypothesis testing) with a p-value summary plot
# ------------------------------
# セクション3:確率と分布(例:色違いポケモン) / Section 3: Probability and distributions (example: shiny Pokémon)
p_shiny = 1 / 4096
n_trials = 8000
x_shiny = 4
p_hat = x_shiny / n_trials
se_shiny = np.sqrt(p_shiny * (1 - p_shiny) / n_trials)
z_shiny = (p_hat - p_shiny) / se_shiny
p_val_shiny = 2 * (1 - stats.norm.cdf(abs(z_shiny)))
# ------------------------------
# セクション4:区間推定(例:HPの標本平均) / Section 4: Interval estimation (example: sample mean of HP)
sample_hp = np.array([62, 58, 59, 63, 61, 60, 64, 59, 62, 58])
mu_hp = 60
mean_hp = sample_hp.mean()
std_hp = sample_hp.std(ddof=1)
n = len(sample_hp)
se_hp = std_hp / np.sqrt(n)
t_hp = (mean_hp - mu_hp) / se_hp
p_val_hp = 2 * (1 - stats.t.cdf(abs(t_hp), df=n - 1))
# ------------------------------
# セクション5:仮説検定(例:進化前後の攻撃力) / Section 5: Hypothesis testing (example: Attack before vs after evolution)
atk_pre = np.array([50, 48, 52, 51, 49])
atk_post = np.array([60, 58, 63, 61, 59])
t_stat, p_val_attack = stats.ttest_ind(atk_pre, atk_post)
# ------------------------------
# 分散比検定(例:技のばらつき) / Variance ratio test (example: spread of move damage)
var1 = np.var([40, 42, 43, 38, 41], ddof=1)
var2 = np.var([28, 30, 29, 31, 27], ddof=1)
f_stat = var1 / var2
df1, df2 = 4, 4
p_val_var = 2 * min(stats.f.cdf(f_stat, df1, df2), 1 - stats.f.cdf(f_stat, df1, df2))
# ------------------------------
# 可視化:P値のまとめ / Visualization: summary of p-values
labels = ['Shiny Rate Test', 'HP Mean Test', 'Attack Diff Test', 'Variance Ratio Test']
p_values = [p_val_shiny, p_val_hp, p_val_attack, p_val_var]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Summary of Hypothesis Tests (Sections 3 to 5)")
plt.ylabel("P-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()
for bar, p in zip(bars, p_values):
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
ha='center', va='bottom')
plt.tight_layout()
plt.show()
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
# --- Re-defining variables due to NameError ---
# Sample Pokémon data
pokemon_data = {
'Name': ['Bulbasaur', 'Charmander', 'Squirtle', 'Pikachu', 'Sandshrew', 'Meowth', 'Psyduck', 'Mankey', 'Growlithe', 'Geodude'],
'Attack': [49, 52, 48, 55, 75, 45, 52, 80, 70, 80],
'HP': [45, 39, 44, 35, 50, 40, 50, 40, 55, 40],
'Type': ['Grass', 'Fire', 'Water', 'Electric', 'Ground', 'Normal', 'Water', 'Fighting', 'Fire', 'Rock']
}
df = pd.DataFrame(pokemon_data)
# Hypothesis Test 1: Is mean attack = 60?
mu_attack = 60
x_bar = df['Attack'].mean()
s = df['Attack'].std(ddof=1)
n = len(df)
t_stat = (x_bar - mu_attack) / (s / np.sqrt(n))
p_attack = 2 * (1 - stats.t.cdf(abs(t_stat), df=n - 1))
# Hypothesis Test 2: Is Pikachu's HP lower than the overall mean?
mu_hp = df['HP'].mean()
pikachu_hp = df[df['Name'] == 'Pikachu']['HP'].values[0]
p_hp = stats.norm.cdf(pikachu_hp, loc=mu_hp, scale=df['HP'].std(ddof=1))
# Hypothesis Test 3: Attack comparison between Fire and Water types
atk_fire = df[df['Type'] == 'Fire']['Attack']
atk_water = df[df['Type'] == 'Water']['Attack']
t_type, p_type = stats.ttest_ind(atk_fire, atk_water)
# Hypothesis Test 4: Is attack variance = 100?
s2_attack = df['Attack'].var(ddof=1)
chi2_stat = (n - 1) * s2_attack / 100
p_var = 2 * min(stats.chi2.cdf(chi2_stat, df=n - 1), 1 - stats.chi2.cdf(chi2_stat, df=n - 1))
# --- Plot with English labels and title ---
labels = [
'Attack Mean = 60',
'Pikachu HP < Mean HP',
'Fire vs Water Attack',
'Attack Variance = 100'
]
p_values = [p_attack, p_hp, p_type, p_var]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Hypothesis Tests for Pokémon (Gen 1)")
plt.ylabel("p-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()
# Display p-values above bars
for bar, p in zip(bars, p_values):
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
ha='center', va='bottom')
plt.tight_layout()
plt.show()
# プログラム名: pokemon_multi_hit_move_expectation.py
# 内容: 幾何分布に基づくポケモンの連続技の命中回数と期待威力を計算 / Description: Compute the hit-count distribution and expected power of a multi-hit Pokémon move based on a geometric distribution
import numpy as np
import matplotlib.pyplot as plt
# --- 設定 / Parameters ---
p = 0.1       # 各ヒットの失敗確率(当たった後に外す確率)/ Probability of missing after each hit
power = 20    # 一撃あたりの威力 / Power per hit
max_hits = 10 # 最大ヒット回数(例:ねずみざん)/ Maximum number of hits (e.g., the move ねずみざん)
prob = np.zeros(max_hits + 1)
# --- 幾何分布の上限付きシミュレーション / Capped geometric distribution ---
prob[0] = p  # 最初の失敗(1発も当たらない)/ Miss on the first attempt (zero hits)
for i in range(1, max_hits):
    prob[i] = (1 - p)**i * p
prob[max_hits] = 1.0 - np.sum(prob[:-1])  # 最後に補正(残りの確率を最大ヒット数に割り当て)/ Assign the remaining probability mass to the cap so the total is 1
# --- 表示 / Print result ---
for i in range(max_hits + 1):
print(f"Prob {i} Hits: {prob[i]:.3f}")
# --- 期待値計算 / Expected value of power ---
num_hits = np.arange(0, max_hits + 1)
expected_damage = np.sum(power * num_hits * prob)
print(f"\nExpected total power: {expected_damage:.2f}")
# --- 可視化 / Plot ---
plt.figure(figsize=(10, 5))
plt.bar(num_hits, prob, color='skyblue')
plt.xlabel("Number of Hits")
plt.ylabel("Probability")
plt.title(f"Hit Distribution of Multi-Hit Move (max {max_hits}, p_miss = {p})")
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()
# Program Name: pokemon_statistical_analysis_advanced.py
# Creation Date: 20250419
# Overview: Advanced statistical and visual analysis of Pokémon base stats
# Usage: Run with `python pokemon_statistical_analysis_advanced.py`
# --- ライブラリのインストール / Install required libraries ---
# !pip install pandas matplotlib seaborn scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# --- データ定義 / Define data ---
data = {
'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Gengar', 'Machamp'],
'HP': [35, 78, 160, 60, 90],
'Attack': [55, 84, 110, 65, 130],
'Defense': [40, 78, 65, 60, 80],
'Speed': [90, 100, 30, 110, 55],
'Special': [50, 85, 65, 130, 65]
}
df = pd.DataFrame(data)
df['Total'] = df[['HP', 'Attack', 'Defense', 'Speed', 'Special']].sum(axis=1)
# === ⑤ 散布図と相関分析 / Scatterplot and correlation ===
plt.figure(figsize=(6, 5))
sns.scatterplot(x='Attack', y='Speed', data=df, s=100)
corr = df['Attack'].corr(df['Speed'])
plt.title(f"Attack vs Speed (corr = {corr:.2f})")
plt.xlabel("Attack")
plt.ylabel("Speed")
plt.grid(True)
plt.tight_layout()
plt.show()
# === ⑥ 主成分分析 (PCA) / PCA to visualize multi-dimensional stats ===
features = ['HP', 'Attack', 'Defense', 'Speed', 'Special']
X = df[features]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.figure(figsize=(6, 5))
plt.scatter(X_pca[:, 0], X_pca[:, 1], s=100)
for i, name in enumerate(df['Name']):
plt.text(X_pca[i, 0]+0.01, X_pca[i, 1], name)
plt.title("PCA of Pokémon Stats")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.tight_layout()
plt.show()
# === ⑦ KMeansクラスタリング / Cluster Pokémon by stats ===
kmeans = KMeans(n_clusters=2, random_state=0)
df['Cluster'] = kmeans.fit_predict(X)
plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Cluster'], s=100, palette='Set2')
for i, name in enumerate(df['Name']):
plt.text(X_pca[i, 0]+0.01, X_pca[i, 1], name)
plt.title("Clustered Pokémon by Stats (PCA Space)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.tight_layout()
plt.show()
# === ⑧ レーダーチャート / Radar chart for each Pokémon ===
def radar_chart(row, features, title):
values = row[features].tolist()
values += values[:1] # to close the radar circle
angles = np.linspace(0, 2*np.pi, len(features)+1)
plt.figure(figsize=(5, 5))
ax = plt.subplot(111, polar=True)
ax.plot(angles, values, 'o-', linewidth=2)
ax.fill(angles, values, alpha=0.25)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(features)
ax.set_title(title)
plt.tight_layout()
plt.show()
# レーダーチャートで全ポケモン可視化 / Visualize every Pokémon with a radar chart
for i, row in df.iterrows():
radar_chart(row, features, f"{row['Name']}'s Base Stats")
# プログラム名: pokemon_data_analysis_all_in_one.py
# Program Name: pokemon_data_analysis_all_in_one.py
# 作成日 / Creation Date: 20250420
# 概要 / Overview: ポケモンの種族値データを用いて、データ集計、グラフ化、代表値、ばらつき、標準化、相関、確率計算までを一括実行するプログラム。/ A program that uses Pokémon base-stat data to run data aggregation, plotting, summary statistics, dispersion, standardization, correlation, and probability calculations in one pass.
# 使い方 / Usage: ターミナルで `python pokemon_data_analysis_all_in_one.py` を実行 / Run `python pokemon_data_analysis_all_in_one.py` in the terminal
# --- 必要なライブラリのインストール / Install required libraries ---
# !pip install pandas matplotlib seaborn numpy scipy
# --- ライブラリのインポート / Import required libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# --- 数値定義 / Centralized parameter definitions ---
pokemon_data = {
'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
'Defense': [40, 78, 65, 49, 90, 50, 60, 70],
'Speed': [90, 100, 30, 45, 130, 55, 110, 55],
'HP': [35, 78, 160, 45, 106, 55, 60, 90]
}
df = pd.DataFrame(pokemon_data)
# === 1. データの集計 / Data Aggregation ===
# 1-1 データ表示 / Show raw data
print(df)
# 1-2 平均/中央値などの統計値 / Basic statistics (mean, median)
desc_stats = df.describe()
print(desc_stats)
# 1-3 グラフ表示:種族値の棒グラフ / Bar chart of base stats
df.set_index('Name')[['Attack', 'Defense', 'Speed', 'HP']].plot(kind='bar')
plt.title('Base Stats of Pokémon')
plt.ylabel('Stat Value')
plt.xlabel('Pokémon')
plt.legend(title='Attributes')
plt.tight_layout()
plt.show()
# === 2. さまざまなグラフ / Various Charts ===
# 2-1 クロス集計(速度と攻撃力)/ Crosstab (Speed category vs Attack category)
df['Speed_Level'] = pd.cut(df['Speed'], bins=[0, 50, 100, 150], labels=['Slow', 'Medium', 'Fast'])
df['Attack_Level'] = pd.cut(df['Attack'], bins=[0, 60, 100, 150], labels=['Low', 'Medium', 'High'])
cross_tab = pd.crosstab(df['Speed_Level'], df['Attack_Level'])
print(cross_tab)
# 2-2 モザイク図 / Mosaic plot (optional with statsmodels or external lib)
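# 以下はstatsmodelsのmosaic関数が利用できる環境を仮定したモザイク図のスケッチです。
# 上で作成したクロス集計 cross_tab をタプルキーの辞書に変換して描画します。
# A sketch of the mosaic plot, assuming statsmodels' mosaic function is available;
# it converts the crosstab computed above into a dict keyed by (Speed_Level, Attack_Level).
from statsmodels.graphics.mosaicplot import mosaic
counts = {(str(i), str(j)): int(cross_tab.loc[i, j])
          for i in cross_tab.index for j in cross_tab.columns}
mosaic(counts, gap=0.02, title='Mosaic Plot: Speed Level vs Attack Level')
plt.tight_layout()
plt.show()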
# 2-3 積み上げ棒グラフ / Stacked bar (Attack + Defense)
df_plot = df.set_index('Name')[['Attack', 'Defense']]
df_plot.plot(kind='bar', stacked=True)
plt.title('Attack and Defense by Pokémon')
plt.ylabel('Stat Total')
plt.xlabel('Pokémon')
plt.legend(title='Attributes')
plt.tight_layout()
plt.show()
# === 3. 時系列データ(サンプル)/ Time Series Example ===
# 3-1 架空の時系列データを生成 / Generate fake time series (e.g., HP over time)
time = pd.date_range(start='2025-01-01', periods=8, freq='M')
hp_time_df = pd.DataFrame({'Date': time, 'HP': df['HP'].values})
hp_time_df.set_index('Date')['HP'].plot(marker='o')
plt.title('HP Over Time')
plt.ylabel('HP')
plt.xlabel('Date')
plt.grid(True)
plt.tight_layout()
plt.show()
# === 4. 代表値と箱ひげ図 / Representative Values and Boxplot ===
mean_hp = df['HP'].mean()
median_hp = df['HP'].median()
mode_hp = df['HP'].mode()[0]
print(f"Mean HP: {mean_hp}, Median HP: {median_hp}, Mode HP: {mode_hp}")
# 箱ひげ図 / Boxplot
sns.boxplot(data=df[['Attack', 'Defense', 'Speed', 'HP']])
plt.title('Boxplot of Pokémon Stats')
plt.ylabel('Value')
plt.tight_layout()
plt.show()
# === 5. データのばらつき / Data Dispersion ===
# 分散・標準偏差・変動係数 / Variance, Std, CV
for col in ['Attack', 'Defense', 'Speed', 'HP']:
var = df[col].var()
std = df[col].std()
cv = std / df[col].mean()
print(f"{col}: Variance={var:.2f}, StdDev={std:.2f}, CV={cv:.2f}")
# === 6. データの標準化 / Data Standardization ===
# レーダーチャート用標準化 / Normalize for radar chart
df_norm = df.copy()
cols = ['Attack', 'Defense', 'Speed', 'HP']
df_norm[cols] = (df[cols] - df[cols].mean()) / df[cols].std()
# 偏差値 / Deviation value calculation
df['HP_deviation'] = 50 + 10 * (df['HP'] - df['HP'].mean()) / df['HP'].std()
print(df[['Name', 'HP', 'HP_deviation']])
# === 7. データの相関 / Correlation Analysis ===
plt.scatter(df['Attack'], df['Speed'], s=df['HP'], alpha=0.5)
plt.title('Bubble Chart: Attack vs Speed')
plt.xlabel('Attack')
plt.ylabel('Speed')
plt.tight_layout()
plt.show()
# 数値列のみに限定して相関係数を計算 / Limit to numeric columns for correlation
corr = df.select_dtypes(include=[np.number]).corr()
print("Correlation Matrix:\n", corr)
# === 8. 確率計算 / Probability Calculation ===
# 同時確率(例:攻撃>80かつHP>100のポケモンの割合)/ Joint probability (e.g., share of Pokémon with Attack > 80 and HP > 100)
prob_all = len(df)  # 全ポケモン数 / total number of Pokémon
prob_attack_hp = len(df[(df['Attack'] > 80) & (df['HP'] > 100)]) / prob_all
print(f"Joint Probability (Attack > 80 and HP > 100): {prob_attack_hp:.2f}")
# Program Name: statistics_fundamentals_all_in_one.py
# Creation Date: 20250420
# Overview: 統計学の基本トピックを体系的に網羅し、Pythonで一括実行できる学習・可視化ツール / A learning and visualization tool that systematically covers fundamental statistics topics and runs them in one pass with Python
# Usage: To run the program, use the command `python statistics_fundamentals_all_in_one.py` in the terminal
# --- 必要なライブラリのインストール / Install required libraries ---
# !pip install pandas matplotlib seaborn numpy scipy
# --- ライブラリのインポート / Import required libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from math import factorial
# --- データ定義 / Sample data definition ---
np.random.seed(0)
data = np.random.normal(loc=50, scale=10, size=100) # 平均50、標準偏差10の正規分布 / Normal distribution
# === 2. 度数分布とヒストグラム / Frequency Distribution and Histogram ===
plt.hist(data, bins=10, edgecolor='black')
plt.title('Histogram of Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()
cum_counts, bin_edges = np.histogram(data, bins=10)
cum_dist = np.cumsum(cum_counts)
print("Cumulative Frequency Distribution:", cum_dist)
# ローレンツ曲線とジニ係数 / Lorenz Curve and Gini Coefficient
sorted_data = np.sort(data)
cumulative = np.cumsum(sorted_data) / np.sum(sorted_data)
lorenz_curve = np.insert(cumulative, 0, 0)
x = np.linspace(0.0, 1.0, len(lorenz_curve))
plt.plot(x, lorenz_curve, label='Lorenz Curve')
plt.plot(x, x, '--', label='Line of Equality')
plt.title('Lorenz Curve')
plt.xlabel('Cumulative Share of People')
plt.ylabel('Cumulative Share of Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
gini = 1 - 2 * np.trapz(lorenz_curve, x)
print(f"Gini Coefficient: {gini:.3f}")
# === 3. 代表値と分布形状 / Central Tendency and Shape ===
mean_val = np.mean(data)
median_val = np.median(data)
mode_val = pd.Series(data).mode()[0]
skew_val = skew(data)
kurt_val = kurtosis(data)
print(f"Mean: {mean_val:.2f}, Median: {median_val:.2f}, Mode: {mode_val:.2f}")
print(f"Skewness: {skew_val:.2f}, Kurtosis: {kurt_val:.2f}")
sns.boxplot(data=data)
plt.title("Boxplot of Sample Data")
plt.tight_layout()
plt.show()
# === 6. 分散と標準偏差 / Variance and Std Dev ===
var = np.var(data, ddof=1)
std = np.std(data, ddof=1)
cv = std / mean_val
print(f"Variance: {var:.2f}, Std Dev: {std:.2f}, Coefficient of Variation: {cv:.2f}")
# === 7. 場合の数 / Counting ===
n, r = 5, 3
perm = factorial(n) // factorial(n - r) # 順列 / Permutation
comb = factorial(n) // (factorial(r) * factorial(n - r)) # 組合せ / Combination
print(f"P({n},{r}) = {perm}, C({n},{r}) = {comb}")
# === 9. 期待値 / Expected Value ===
dice = np.array([1, 2, 3, 4, 5, 6])
probs = np.full(6, 1/6)
expectation = np.sum(dice * probs)
print(f"Expected value of a fair dice roll: {expectation:.2f}")
# === 10. ベイズの定理 / Bayes' Theorem ===
P_disease = 0.01
P_pos_given_disease = 0.99
P_pos_given_healthy = 0.05
P_healthy = 1 - P_disease
P_positive = (P_pos_given_disease * P_disease) + (P_pos_given_healthy * P_healthy)
P_disease_given_pos = (P_pos_given_disease * P_disease) / P_positive
print(f"Bayes Theorem: P(Disease | Positive) = {P_disease_given_pos:.3f}")
# Program Name: pokemon_probability_distribution_full.py
# Creation Date: 20250420
# Overview: ポケモンの種族値データを使って、離散・連続確率分布、期待値、分散、大数の法則、中心極限定理を一括で学べる統計学統合プログラム / An integrated statistics program for learning discrete and continuous probability distributions, expectation, variance, the law of large numbers, and the central limit theorem with Pokémon base-stat data
# Usage: python pokemon_probability_distribution_full.py
# --- ライブラリのインポート / Import libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import poisson, norm, binom, geom, expon, uniform
# --- ポケモン種族値データをさらに拡張 / Further Expanded Pokémon base stats ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros'
],
'HP': [
35, 78, 160, 45, 106, 55,
60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80,
79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55,
60, 95, 60, 105, 55, 75
],
'Attack': [
55, 84, 110, 49, 110, 55,
65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82,
83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130,
50, 95, 80, 130, 65, 100
]
}
df = pd.DataFrame(pokemon)
print(df)  # データの確認 / Inspect the data
# === 11. 離散型・連続型確率分布 / Discrete and Continuous Probability Distributions ===
mu_attack = np.mean(df['Attack'])
x_poisson = np.arange(0, 200, 1)
pmf_poisson = poisson.pmf(x_poisson, mu_attack)
plt.plot(x_poisson, pmf_poisson, 'o-', label=f'Poisson(μ={mu_attack:.1f})')
plt.title("Poisson Distribution (Attack)")
plt.xlabel("Attack Value")
plt.ylabel("Probability")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
# 正規分布(HP) / Normal distribution (HP)
mu_hp = np.mean(df['HP'])
sigma_hp = np.std(df['HP'], ddof=1)
x_normal = np.linspace(0, 200, 300)
pdf_normal = norm.pdf(x_normal, mu_hp, sigma_hp)
plt.plot(x_normal, pdf_normal, label=f'N({mu_hp:.1f}, {sigma_hp:.1f}²)')
plt.title("Normal Distribution of HP")
plt.xlabel("HP")
plt.ylabel("Density")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
# === 12. CDFと期待値・分散 / CDF, Expectation, and Variance ===
cdf_normal = norm.cdf(x_normal, mu_hp, sigma_hp)
plt.plot(x_normal, cdf_normal)
plt.title("Cumulative Distribution Function (HP)")
plt.xlabel("HP")
plt.ylabel("Cumulative Probability")
plt.grid(True)
plt.tight_layout()
plt.show()
expected_hp = np.mean(df['HP'])
variance_hp = np.var(df['HP'], ddof=1)
print(f"Expected HP: {expected_hp:.2f}, Variance: {variance_hp:.2f}")
# === 13. 離散分布:二項・幾何 / Discrete Distributions: Binomial and Geometric ===
x_binom = np.arange(11)
pmf_binom = binom.pmf(x_binom, n=10, p=0.3)
plt.bar(x_binom, pmf_binom)
plt.title("Binomial Distribution (n=10, p=0.3)")
plt.xlabel("Successes")
plt.ylabel("Probability")
plt.tight_layout()
plt.show()
x_geom = np.arange(1, 11)
pmf_geom = geom.pmf(x_geom, p=0.2)
plt.bar(x_geom, pmf_geom)
plt.title("Geometric Distribution (p=0.2)")
plt.xlabel("Trial")
plt.ylabel("Probability")
plt.tight_layout()
plt.show()
# === 14. 標準正規分布とZ変換 / Standard Normal Distribution and Z-Transformation ===
df['HP_zscore'] = (df['HP'] - mu_hp) / sigma_hp
print("Standardized HP (Z-scores):\n", df[['Name', 'HP', 'HP_zscore']])
x_std = np.linspace(-4, 4, 300)
plt.plot(x_std, norm.pdf(x_std), label='Standard Normal')
plt.title("Standard Normal Distribution")
plt.xlabel("Z")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()
# === 15. 指数・一様分布 / Exponential and Uniform Distributions ===
x_exp = np.linspace(0, 10, 300)
plt.plot(x_exp, expon.pdf(x_exp, scale=1), label='Exponential(λ=1)')
plt.title("Exponential Distribution")
plt.xlabel("Time")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()
x_uni = np.linspace(0, 100, 300)
plt.plot(x_uni, uniform.pdf(x_uni, loc=0, scale=100), label='Uniform(0,100)')
plt.title("Uniform Distribution")
plt.xlabel("Value")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()
# === 17. 大数の法則と中心極限定理 / Law of Large Numbers and Central Limit Theorem ===
samples = np.random.choice(df['Attack'], size=(1000, 100), replace=True)
sample_means = samples.mean(axis=1)
plt.plot(sample_means)
plt.axhline(np.mean(df['Attack']), color='red', linestyle='--', label='True Mean')
plt.title("Law of Large Numbers Simulation")
plt.xlabel("Sample Index")
plt.ylabel("Sample Mean")
plt.legend()
plt.tight_layout()
plt.show()
sample_means_dist = np.random.choice(df['Attack'], size=(1000, 30)).mean(axis=1)
sns.histplot(sample_means_dist, kde=True, stat='density')
plt.title("Central Limit Theorem (n=30)")
plt.xlabel("Sample Mean")
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定を詳しく学ぶ。/ Learn point estimation and interval estimation in statistics in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, t, chi2
# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75
]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
n = len(hp_sample)
# === 18. 点推定 / Point Estimation ===
mean_hp = np.mean(hp_sample)
var_hp = np.var(hp_sample, ddof=1)  # 不偏分散 / Unbiased variance
std_hp = np.std(hp_sample, ddof=1)  # 標準偏差 / Standard deviation
se_hp = std_hp / np.sqrt(n)         # 標準誤差 / Standard error
print("[18] Point Estimation")
print(f"Sample Size n = {n}")
print(f"Sample Mean (\u03bĉ): {mean_hp:.2f}")
print(f"Unbiased Variance (s^2): {var_hp:.2f}")
print(f"Standard Deviation (s): {std_hp:.2f}")
print(f"Standard Error (SE): {se_hp:.2f}\n")
# === 19. 区間推定(母分散既知) / Confidence Interval with Known Variance ===
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
ci_known = (mean_hp - z * se_known, mean_hp + z * se_known)
print("[19] Confidence Interval (\u03c3 known)")
print(f"95% CI: {ci_known}\n")
# === 20. 区間推定(母分散未知) / Confidence Interval with Unknown Variance ===
t_score = t.ppf(0.975, df=n-1)
ci_unknown = (mean_hp - t_score * se_hp, mean_hp + t_score * se_hp)
print("[20] Confidence Interval (\u03c3 unknown)")
print(f"95% CI: {ci_unknown}\n")
# === 21. 母比率の区間推定 / Population Proportion ===
success = np.sum(hp_sample >= 80)
p_hat = success / n
se_p = np.sqrt(p_hat * (1 - p_hat) / n)
ci_prop = (p_hat - z * se_p, p_hat + z * se_p)
print("[21] Confidence Interval for Proportion (P(HP >= 80))")
print(f"Proportion: {p_hat:.2f}, 95% CI: {ci_prop}\n")
# === 22. 母分散の区間推定 / Variance Estimation ===
chi2_lower = chi2.ppf(0.025, df=n-1)
chi2_upper = chi2.ppf(0.975, df=n-1)
ci_var = ((n - 1) * var_hp / chi2_upper, (n - 1) * var_hp / chi2_lower)
print("[22] Confidence Interval for Population Variance")
print(f"95% CI for Variance: {ci_var}\n")
# --- プロット: 信頼区間の視覚化 / Visualizing Confidence Intervals ---
plt.figure(figsize=(10, 6))
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
plt.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
plt.title("Confidence Intervals for Population Mean HP")
plt.xlabel("HP")
plt.legend()
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・検定・相関分析を詳しく学ぶ。/ Learn point estimation, interval estimation, hypothesis testing, and correlation analysis in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr
# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)
# === 18. 点推定 / Point Estimation ===
mean_hp = np.mean(hp_sample)
var_hp = np.var(hp_sample, ddof=1)
std_hp = np.std(hp_sample, ddof=1)
se_hp = std_hp / np.sqrt(n)
print("[18] Point Estimation")
print(f"Sample Size n = {n}")
print(f"Sample Mean (μ̂): {mean_hp:.2f}")
print(f"Unbiased Variance (s^2): {var_hp:.2f}")
print(f"Standard Deviation (s): {std_hp:.2f}")
print(f"Standard Error (SE): {se_hp:.2f}\n")
plt.figure()
plt.hist(hp_sample, bins=10, edgecolor='black')
plt.title("Histogram of HP")
plt.xlabel("HP")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
# === 19. 区間推定(母分散既知) / Confidence Interval (σ known) ===
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
ci_known = (mean_hp - z * se_known, mean_hp + z * se_known)
print("[19] Confidence Interval (σ known)")
print(f"95% CI: {ci_known}\n")
# === 20. 区間推定(母分散未知) / Confidence Interval (σ unknown) ===
t_score = t.ppf(0.975, df=n-1)
ci_unknown = (mean_hp - t_score * se_hp, mean_hp + t_score * se_hp)
print("[20] Confidence Interval (σ unknown)")
print(f"95% CI: {ci_unknown}\n")
# === 21. 母比率の区間推定 / Confidence Interval for a Proportion ===
success = np.sum(hp_sample >= 80)
p_hat = success / n
se_p = np.sqrt(p_hat * (1 - p_hat) / n)
ci_prop = (p_hat - z * se_p, p_hat + z * se_p)
print("[21] Confidence Interval for Proportion (P(HP >= 80))")
print(f"Proportion: {p_hat:.2f}, 95% CI: {ci_prop}\n")
plt.figure()
plt.bar(['HP >= 80', 'HP < 80'], [success, n - success], color=['skyblue', 'lightcoral'])
plt.title("HP ≥ 80 vs HP < 80")
plt.ylabel("Count")
plt.tight_layout()
plt.show()
# === 22. 母分散の区間推定 / Confidence Interval for the Population Variance ===
chi2_lower = chi2.ppf(0.025, df=n-1)
chi2_upper = chi2.ppf(0.975, df=n-1)
ci_var = ((n - 1) * var_hp / chi2_upper, (n - 1) * var_hp / chi2_lower)
print("[22] Confidence Interval for Population Variance")
print(f"95% CI for Variance: {ci_var}\n")
# === 23. 検定の基礎と用語 / Basics and Terminology of Hypothesis Testing ===
# → 検定とは:仮説(H0, H1)に基づき、サンプルから統計的判断を行う方法。/ A hypothesis test draws a statistical conclusion from a sample based on the hypotheses H0 and H1.
# → 有意水準α、検出力(1-β)、第1種の過誤(α)、第2種の過誤(β)、両側/片側検定 などを含む。/ Key terms include the significance level α, power (1-β), Type I error (α), Type II error (β), and two-sided vs one-sided tests.
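# --- 補足: 有意水準と検出力のシミュレーション / Supplement: simulating the significance level and power ---
# 正規母集団を仮定した1標本t検定について、帰無仮説が真のときの棄却率(おおよそα)と、
# 真の平均が +10 ずれているときの棄却率(検出力)を乱数で確かめるスケッチです。
# ここでの母集団設定(μ0=80, σ=30, n=30)は例示用の仮定であり、上のデータから得た値ではありません。
# A sketch that, assuming a normal population, estimates the rejection rate of a one-sample t-test
# when H0 is true (roughly α) and when the true mean is shifted by +10 (the power).
# The population settings (μ0=80, σ=30, n=30) are illustrative assumptions, not values from the data above.
rng = np.random.default_rng(0)
alpha_sim, n_sim, n_obs = 0.05, 2000, 30
mu0_sim, sigma_sim = 80, 30
reject_h0 = reject_h1 = 0
for _ in range(n_sim):
    reject_h0 += ttest_1samp(rng.normal(mu0_sim, sigma_sim, n_obs), mu0_sim).pvalue < alpha_sim
    reject_h1 += ttest_1samp(rng.normal(mu0_sim + 10, sigma_sim, n_obs), mu0_sim).pvalue < alpha_sim
print(f"Estimated Type I error rate (≈ α = {alpha_sim}): {reject_h0 / n_sim:.3f}")
print(f"Estimated power against a +10 shift: {reject_h1 / n_sim:.3f}")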
# === 24. 平均値の検定(t検定)/ Test of a Mean (One-sample t-test) ===
mu0 = 80
print("[24] One-sample t-test against μ0 = 80")
t_stat, p_val = ttest_1samp(hp_sample, mu0)
print(f"t-statistic = {t_stat:.3f}, p-value (two-tailed) = {p_val:.4f}\n")
plt.figure()
plt.hist(hp_sample, bins=10, color='lightgreen', edgecolor='black')
plt.axvline(mu0, color='red', linestyle='--', label='μ₀ = 80')
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.title("One-sample t-test Visualization")
plt.xlabel("HP")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()
# === 26. 相関分析 / Correlation Analysis ===
print("[26] Correlation Analysis between HP and Attack")
correlation, p_corr = pearsonr(hp_sample, attack_sample)
print(f"Pearson r = {correlation:.3f}, p-value = {p_corr:.4f}\n")
plt.figure()
plt.scatter(hp_sample, attack_sample, alpha=0.7)
plt.title("Scatter Plot: HP vs Attack")
plt.xlabel("HP")
plt.ylabel("Attack")
plt.grid(True)
plt.tight_layout()
plt.show()
# === Visualization: Confidence Interval Summary ===
plt.figure(figsize=(10, 6))
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
plt.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
plt.title("Confidence Intervals for Population Mean HP")
plt.xlabel("HP")
plt.legend()
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・検定・相関分析・回帰分析・分散分析を詳しく学ぶ。/ Learn point estimation, interval estimation, hypothesis testing, correlation, regression, and ANOVA in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)
# === 27. 単回帰分析 / Simple Linear Regression ===
print("[27] Simple Linear Regression: Predict Attack from HP")
X = np.array(hp_sample).reshape(-1, 1)
y = np.array(attack_sample)
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
residuals = y - y_pred
print(f"Regression Equation: Attack = {model.coef_[0]:.2f} * HP + {model.intercept_:.2f}")
print(f"R^2 (coefficient of determination): {r2:.3f}")
print(f"MSE (mean squared error): {mse:.2f}")
print(f"Mean of Residuals: {np.mean(residuals):.2f}\n")
plt.figure()
sns.regplot(x=hp_sample, y=attack_sample, ci=None, line_kws={'color': 'red'})
plt.title("Simple Linear Regression: HP → Attack")
plt.xlabel("HP")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
plt.figure()
plt.scatter(hp_sample, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residual Plot")
plt.xlabel("HP")
plt.ylabel("Residuals")
plt.tight_layout()
plt.show()
# === 28. 等分散性検定とWelchのt検定 / Test for Equal Variances and Welch's t-test ===
print("[28] Welch's t-test (HP >= 80 vs HP < 80)")
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
print(f"Welch's t-statistic: {t_stat:.3f}, p-value: {p_val:.4f}")
print(f"Group 1 Mean Attack (HP ≥ 80): {group1.mean():.2f}")
print(f"Group 2 Mean Attack (HP < 80): {group2.mean():.2f}\n")
plt.figure()
sns.boxplot(data=[group1, group2], notch=True)
plt.xticks([0, 1], ["HP ≥ 80", "HP < 80"])
plt.title("Attack Comparison by HP Group (Welch's t-test)")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
# === 29. 一元配置分散分析 / One-Way ANOVA ===
print("[29] One-Way ANOVA on Attack grouped by HP Quantiles")
df['HP_Q'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
groups = [df[df['HP_Q'] == q]['Attack'] for q in ['Low', 'Mid', 'High']]
F_stat, p_val = f_oneway(*groups)
print(f"ANOVA F-statistic: {F_stat:.3f}, p-value: {p_val:.4f}")
print("Group Means:")
for q in ['Low', 'Mid', 'High']:
mean_attack = df[df['HP_Q'] == q]['Attack'].mean()
print(f" {q} HP Group Mean Attack: {mean_attack:.2f}")
print()
plt.figure()
sns.boxplot(x='HP_Q', y='Attack', data=df, order=['Low', 'Mid', 'High'])
plt.title("One-Way ANOVA: Attack by HP Quantile")
plt.xlabel("HP Group")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・検定・相関分析・回帰分析・分散分析を詳しく学ぶ。/ Learn point estimation, interval estimation, hypothesis testing, correlation, regression, and ANOVA in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)
# === 28. 等分散性のF検定(信頼区間つき)とWelchのt検定 / F-test for equal variances (with CI) and Welch's t-test ===
print("[28] F-test for Equal Variances")
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
var1 = np.var(group1, ddof=1)
var2 = np.var(group2, ddof=1)
f_stat = var1 / var2 if var1 > var2 else var2 / var1
# 自由度とF分布の下限・上限臨界値 / Degrees of freedom and lower/upper critical values of the F distribution
n1, n2 = len(group1), len(group2)
df1, df2 = n1 - 1, n2 - 1
f_crit_low = f.ppf(0.025, df1, df2)
f_crit_high = f.ppf(0.975, df1, df2)
# 母分散比の信頼区間計算 / Confidence interval for the ratio of population variances
ci_f_lower = var1 / var2 / f_crit_high
ci_f_upper = var1 / var2 / f_crit_low
print(f"F-statistic (larger variance / smaller variance): {f_stat:.3f}")
print(f"95% Confidence Interval for Variance Ratio (σ1² / σ2²): ({ci_f_lower:.3f}, {ci_f_upper:.3f})")
print(f"Variance Group1 (HP ≥ 80): {var1:.2f}, n = {n1}")
print(f"Variance Group2 (HP < 80): {var2:.2f}, n = {n2}")
print("Interpretation: If CI includes 1, we cannot reject equal variances.\n")
# Welch's t-test(等分散を仮定しない2群平均差の検定)/ two-sample test of the mean difference without assuming equal variances
print("[28] Welch's t-test (HP >= 80 vs HP < 80)")
t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
print(f"Welch's t-statistic: {t_stat:.3f}, p-value: {p_val:.4f}")
print(f"Group 1 Mean Attack (HP ≥ 80): {group1.mean():.2f}")
print(f"Group 2 Mean Attack (HP < 80): {group2.mean():.2f}")
print("Interpretation: If p < 0.05, the group means differ significantly.\n")
# 可視化(ボックスプロット)/ Visualization: boxplot
plt.figure()
sns.boxplot(data=[group1, group2], notch=True)
plt.xticks([0, 1], ["HP ≥ 80", "HP < 80"])
plt.title("Attack Comparison by HP Group (Welch's t-test)")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
# === 29. 一元配置分散分析 / One-Way ANOVA ===
print("[29] One-Way ANOVA on Attack grouped by HP Quantiles")
df['HP_Q'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
groups = [df[df['HP_Q'] == q]['Attack'] for q in ['Low', 'Mid', 'High']]
F_stat, p_val = f_oneway(*groups)
print(f"ANOVA F-statistic: {F_stat:.3f}, p-value: {p_val:.4f}")
print("Group Means (Attack):")
for q in ['Low', 'Mid', 'High']:
    mean_attack = df[df['HP_Q'] == q]['Attack'].mean()
    print(f" {q} HP Group: {mean_attack:.2f}")
print("Interpretation: If p < 0.05, at least one group mean differs.\n")
# 可視化(HP三分位によるAttackのBoxplot)/ Visualization: boxplot of Attack by HP tertile
plt.figure()
sns.boxplot(x='HP_Q', y='Attack', data=df, order=['Low', 'Mid', 'High'])
plt.title("One-Way ANOVA: Attack by HP Quantile")
plt.xlabel("HP Group")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
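# --- 補足 / Supplement: Tukey HSD 多重比較 / post-hoc pairwise comparison for the ANOVA above (a minimal sketch; assumes statsmodels is installed) ---
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(endog=df['Attack'], groups=df['HP_Q'].astype(str), alpha=0.05)
print(tukey.summary())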
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: A program to study point estimation, interval estimation, hypothesis testing, correlation, regression, ANOVA, and odds-ratio analysis in detail using Pokémon base stats
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f, fisher_exact
# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
]
}
df = pd.DataFrame(pokemon)
# === 1. 2x2のクロス集計表とオッズ比 / 2x2 Contingency Table & Odds Ratio ===
# 仮の条件: HP >= 80 を "高HP"、Attack >= 100 を "高Attack" と定義 / Working definition: HP >= 80 = "High HP", Attack >= 100 = "High Attack"
high_hp = df['HP'] >= 80
high_attack = df['Attack'] >= 100
# クロス集計表作成 / Contingency table
contingency = pd.crosstab(high_hp, high_attack)
print("[1] 2x2 Contingency Table (High HP vs High Attack):")
print(contingency, "\n")
# オッズ比の計算 / Odds Ratio
oddsratio, p_fisher = fisher_exact(contingency)
print(f"Odds Ratio: {oddsratio:.3f}")
print(f"Fisher's Exact Test p-value: {p_fisher:.4f}")
print("Interpretation: If OR > 1, high HP is positively associated with high Attack.\n")
# 可視化:クロス集計のヒートマップ / Visualization: heatmap of the contingency table
sns.heatmap(contingency, annot=True, fmt='d', cmap='Blues')
plt.title("2x2 Contingency Table: High HP vs High Attack")
plt.xlabel("High Attack")
plt.ylabel("High HP")
plt.tight_layout()
plt.show()
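# --- 補足 / Supplement: オッズ比のWald 95%信頼区間 / Wald 95% CI for the odds ratio above (a minimal sketch; assumes all four cells of the table are non-zero) ---
a, b = contingency.iloc[0, 0], contingency.iloc[0, 1]
c, d = contingency.iloc[1, 0], contingency.iloc[1, 1]
log_or = np.log((a * d) / (b * c))
se_log_or = np.sqrt(1 / a + 1 / b + 1 / c + 1 / d)
z975 = norm.ppf(0.975)
print(f"Odds Ratio 95% CI (Wald): ({np.exp(log_or - z975 * se_log_or):.3f}, {np.exp(log_or + z975 * se_log_or):.3f})")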
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: A program to study point estimation, interval estimation, hypothesis testing, correlation, regression, ANOVA, odds-ratio analysis, probability distributions, and nonparametric tests in detail using Pokémon base stats
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f, fisher_exact, binom, poisson, geom, multinomial, chisquare, spearmanr, mannwhitneyu, wilcoxon, kruskal, gamma
# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
],
'Defense': [
40, 78, 65, 49, 90, 50, 60, 80, 80, 45, 160, 20,
79, 95, 80, 53, 80, 83, 100, 55, 77, 73, 75, 110,
95, 70, 75, 180, 70, 115, 70, 85, 110, 120, 95, 95,
75, 85, 95, 70, 70, 55
],
'Speed': [
90, 100, 30, 45, 130, 55, 110, 55, 60, 120, 70, 20,
81, 80, 105, 87, 95, 80, 78, 110, 85, 60, 100, 30,
70, 100, 50, 70, 67, 75, 150, 55, 45, 40, 85, 110,
91, 50, 70, 70, 105, 60
]
}
df = pd.DataFrame(pokemon)
# --- プロット / Plot: HP vs Attack ---
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='HP', y='Attack')
plt.title('HP vs Attack of Pokémon')
plt.xlabel('HP')
plt.ylabel('Attack')
plt.grid(True)
plt.tight_layout()
plt.show()
# --- プロット / Plot: HP Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['HP'], bins=10, kde=True)
plt.title('Distribution of HP')
plt.xlabel('HP')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# --- プロット / Plot: Attack Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Attack'], bins=10, kde=True)
plt.title('Distribution of Attack')
plt.xlabel('Attack')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# --- プロット / Plot: Defense vs Speed ---
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='Defense', y='Speed')
plt.title('Defense vs Speed of Pokémon')
plt.xlabel('Defense')
plt.ylabel('Speed')
plt.grid(True)
plt.tight_layout()
plt.show()
# --- プロット / Plot: Defense Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Defense'], bins=10, kde=True)
plt.title('Distribution of Defense')
plt.xlabel('Defense')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# --- プロット / Plot: Speed Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Speed'], bins=10, kde=True)
plt.title('Distribution of Speed')
plt.xlabel('Speed')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
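# --- 補足 / Supplement: ノンパラメトリック検定の例 / nonparametric checks suggested by the imports above (a minimal sketch) ---
rho, p_rho = spearmanr(df['HP'], df['Attack'])
print(f"Spearman rho (HP vs Attack): {rho:.3f}, p = {p_rho:.4f}")
u_stat, p_u = mannwhitneyu(df[df['HP'] >= 80]['Attack'], df[df['HP'] < 80]['Attack'], alternative='two-sided')
print(f"Mann-Whitney U (Attack by HP group): U = {u_stat:.1f}, p = {p_u:.4f}")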
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: A program to study point estimation, interval estimation, hypothesis testing, correlation, regression, ANOVA, odds-ratio analysis, probability distributions, nonparametric tests, and multivariate analysis (logistic regression and clustering) in detail using Pokémon base stats
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
'Name': [
'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
],
'HP': [
35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
83, 75, 90, 90, 65, 52
],
'Attack': [
55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
80, 80, 85, 70, 100, 65
],
'Defense': [
40, 78, 65, 49, 90, 50, 60, 80, 80, 45, 160, 20,
79, 95, 80, 53, 80, 83, 100, 55, 77, 73, 75, 110,
95, 70, 75, 180, 70, 115, 70, 85, 110, 120, 95, 95,
75, 85, 95, 70, 70, 55
],
'Speed': [
90, 100, 30, 45, 130, 55, 110, 55, 60, 120, 70, 20,
81, 80, 105, 87, 95, 80, 78, 110, 85, 60, 100, 30,
70, 100, 50, 70, 67, 75, 150, 55, 45, 40, 85, 110,
91, 50, 70, 70, 105, 60
]
}
df = pd.DataFrame(pokemon)
# === 7-1. ロジスティック回帰分析(2クラス分類)/ Logistic Regression (binary classification) ===
# 条件: Attack >= 100 を 1(高火力)とする / Target: Attack >= 100 is coded as 1 (high attack)
X_logit = df[['HP', 'Defense', 'Speed']]
y_logit = (df['Attack'] >= 100).astype(int)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_logit)
log_reg = LogisticRegression()
log_reg.fit(X_scaled, y_logit)
print("\n[7-1] Logistic Regression Coefficients:")
for col, coef in zip(X_logit.columns, log_reg.coef_[0]):
    print(f" {col}: {coef:.3f}")
# === 7-4. 階層型クラスタリング(デンドログラム)/ Hierarchical Clustering (Ward linkage, dendrogram) ===
features = ['HP', 'Attack', 'Defense', 'Speed']
X_cluster = df[features]
X_scaled = StandardScaler().fit_transform(X_cluster)
linkage_matrix = linkage(X_scaled, method='ward')
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix, labels=df['Name'].values, leaf_rotation=90)
plt.title("[7-4] Hierarchical Clustering Dendrogram")
plt.xlabel("Pokémon")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()
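# --- 補足 / Supplement: デンドログラムを3クラスタに分割 / cut the tree into 3 flat clusters (a minimal sketch; fcluster is imported here because it is not imported above) ---
from scipy.cluster.hierarchy import fcluster
hier_labels = fcluster(linkage_matrix, t=3, criterion='maxclust')
print("[7-4] Hierarchical cluster sizes:", pd.Series(hier_labels).value_counts().to_dict())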
# === 7-6. 非階層型クラスタリング(KMeans)/ Non-hierarchical Clustering (KMeans) ===
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
df['KMeansCluster'] = kmeans_labels
# PCA for 2D visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='KMeansCluster', palette='Set2')
plt.title('[7-6] KMeans Clustering with PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()
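# --- 補足 / Supplement: クラスタ品質とPCA寄与率 / cluster quality and PCA variance explained (a minimal sketch) ---
from sklearn.metrics import silhouette_score
print(f"[7-6] Silhouette score (k=3): {silhouette_score(X_scaled, kmeans_labels):.3f}")
print(f"[7-6] PCA explained variance ratio: {pca.explained_variance_ratio_.round(3)}")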
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: A program to study point estimation, interval estimation, hypothesis testing, correlation, regression, ANOVA, odds-ratio analysis, probability distributions, nonparametric tests, and multivariate analysis (logistic regression and clustering) in detail using Pokémon base stats
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import ttest_ind, chi2_contingency, linregress, f_oneway
# --- Step 0: データ定義 / Pokémon Base Stats ---
# 省略(すでに記載済)/ Omitted here (already defined above); the DataFrame df built earlier is reused
# === 1-1. 箱ひげ図 / Boxplot ===
plt.figure()
sns.boxplot(data=df[['HP', 'Attack', 'Defense', 'Speed']])
plt.title("[1-1] Boxplot of Base Stats")
plt.tight_layout()
plt.show()
# === 1-2. 相関係数と散布図 / Correlation & Scatter Matrix ===
sns.pairplot(df[['HP', 'Attack', 'Defense', 'Speed']])
plt.suptitle("[1-2] Scatter Matrix of Stats", y=1.02)
plt.show()
# === 1-3. ヒストグラムと基本統計量 / Histograms & Descriptive Statistics ===
print("\n[1-3] Descriptive Statistics")
print(df[['HP', 'Attack', 'Defense', 'Speed']].describe())
df[['HP', 'Attack', 'Defense', 'Speed']].hist(bins=10, figsize=(10, 6))
plt.suptitle("[1-3] Histograms of Base Stats")
plt.tight_layout()
plt.show()
# === 1-4. 積み上げ棒グラフ / Stacked Bar Chart ===
stat_stack = df[['HP', 'Attack', 'Defense', 'Speed']].head(10)
stat_stack.index = df['Name'][:10]
stat_stack.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title("[1-4] Stacked Bar Graph of Stats (Top 10 Pokémon)")
plt.ylabel("Stat Total")
plt.tight_layout()
plt.show()
# === 1-5. 折れ線グラフ(種族値の推移例)/ Line Plot (base-stat profiles) ===
df_plot = df[['Name', 'HP', 'Attack', 'Defense', 'Speed']].set_index('Name').head(10).T
plt.figure(figsize=(10, 5))
for name in df_plot.columns:
    plt.plot(df_plot.index, df_plot[name], marker='o', label=name)
plt.title("[1-5] Line Plot of Stats (Top 10 Pokémon)")
plt.xlabel("Stat Type")
plt.ylabel("Value")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# === 2-1. 対応のない2標本t検定 / Unpaired Two-Sample t-test ===
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
t_stat, p_val = ttest_ind(group1, group2)
print(f"\n[2-1] Two-sample t-test (Attack by HP Group): t = {t_stat:.3f}, p = {p_val:.4f}")
# === 2-2. 独立性の検定 / Chi-square Test of Independence ===
df['HighSpeed'] = (df['Speed'] > 90)
df['HighAttack'] = (df['Attack'] > 100)
crosstab = pd.crosstab(df['HighSpeed'], df['HighAttack'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print(f"\n[2-2] Chi-square Test of Independence: chi2 = {chi2:.3f}, p = {p:.4f}")
# === 2-3. 単回帰分析 / Simple Linear Regression ===
slope, intercept, r_value, p_value, std_err = linregress(df['HP'], df['Attack'])
print(f"\n[2-3] Simple Linear Regression: Attack ~ HP")
print(f" Slope = {slope:.3f}, Intercept = {intercept:.3f}, R² = {r_value**2:.3f}, p = {p_value:.4f}")
# === 2-5. 一元配置分散分析 / One-Way ANOVA ===
df['HP_Group'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
g1 = df[df['HP_Group'] == 'Low']['Attack']
g2 = df[df['HP_Group'] == 'Mid']['Attack']
g3 = df[df['HP_Group'] == 'High']['Attack']
F_stat, p_val = f_oneway(g1, g2, g3)
print(f"\n[2-5] One-Way ANOVA: F = {F_stat:.3f}, p = {p_val:.4f}")
# Program Name: pokemon_stat_analysis_integrated.py
# Creation Date: 20250420
# Overview: An integrated analysis program covering data summarization, estimation and testing, multivariate analysis, and clustering using Pokémon base stats
# Usage: Run with `python pokemon_stat_analysis_integrated.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, norm, linregress, f_oneway, percentileofscore, zscore
# --- ポケモン種族値の定義 / Pokémon base stat definitions ---
pokemon = {
'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Mewtwo', 'Bulbasaur'],
'HP': [35, 78, 160, 106, 45],
'Attack': [55, 84, 110, 110, 49],
'Defense': [40, 78, 65, 90, 49],
'Speed': [90, 100, 30, 130, 45]
}
df = pd.DataFrame(pokemon)
# --- 2-2: 基本統計とヒストグラム / Descriptive statistics & histograms ---
print("基本統計量 / Descriptive Statistics")
print(df.describe())
stats = ['HP', 'Attack', 'Defense', 'Speed']
df[stats].hist(bins=10, figsize=(10, 6))
plt.suptitle("Histogram of Pokémon Base Stats")
plt.tight_layout()
plt.show()
# --- 散布図と相関 / Scatter matrix & correlation ---
print("\n相関係数行列 / Correlation Matrix")
print(df[stats].corr())
sns.pairplot(df[stats])
plt.suptitle("Scatter Matrix of Pokémon Base Stats", y=1.02)
plt.show()
# --- 箱ひげ図 / Boxplot ---
plt.figure()
sns.boxplot(data=df[stats])
plt.title("Boxplot of Pokémon Base Stats")
plt.tight_layout()
plt.show()
# --- 2-1: t検定(例:HPが80以上と未満のAttack差)/ t-test: Attack difference between HP >= 80 and HP < 80 ---
df['HP_Group'] = ['High' if hp >= 80 else 'Low' for hp in df['HP']]
group1 = df[df['HP_Group'] == 'High']['Attack']
group2 = df[df['HP_Group'] == 'Low']['Attack']
t_stat, p_val = ttest_ind(group1, group2)
print(f"\n[2-1] t検定(HP高低でのAttack差): t = {t_stat:.3f}, p = {p_val:.4f}")
# --- 2-1: z検定(母分散既知を仮定したHP平均の検定)/ z-test of mean HP assuming known population variance ---
mu_hp = 100
sigma_hp = 30
xbar_hp = df['HP'].mean()
n_hp = len(df['HP'])
z = (xbar_hp - mu_hp) / (sigma_hp / np.sqrt(n_hp))
p_z = 2 * (1 - norm.cdf(abs(z)))
print(f"[2-1] z検定(HPの母平均100との比較): z = {z:.3f}, p = {p_z:.4f}")
# --- 2-3: 回帰分析(Attack ~ HP)/ Regression: Attack ~ HP ---
slope, intercept, r_value, p_value, std_err = linregress(df['HP'], df['Attack'])
print(f"\n[2-3] 回帰分析 Attack ~ HP: slope = {slope:.3f}, intercept = {intercept:.3f}, R² = {r_value**2:.3f}, p = {p_value:.4f}")
# --- 2-4: 一元配置分散分析(HP三分位によるAttack差)/ One-way ANOVA: Attack by HP tertile ---
df['HP_Tile'] = pd.qcut(df['HP'], q=3, labels=['Low', 'Mid', 'High'])
low = df[df['HP_Tile'] == 'Low']['Attack']
mid = df[df['HP_Tile'] == 'Mid']['Attack']
high = df[df['HP_Tile'] == 'High']['Attack']
F, p_anova = f_oneway(low, mid, high)
print(f"\n[2-4] 一元配置分散分析: F = {F:.3f}, p = {p_anova:.4f}")
# --- 2-5: ランダムサンプリング・順位・百分位数・標準化 / Random sampling, ranks, percentiles, standardization ---
sample_df = df.sample(n=3, random_state=1)
print("\n[2-5] ランダムサンプリング:")
print(sample_df[['Name', 'HP', 'Attack']])
print("\n[2-5] HPの百分位数:")
df['HP_percentile'] = df['HP'].apply(lambda x: percentileofscore(df['HP'], x))
print(df[['Name', 'HP', 'HP_percentile']])
print("\n[2-5] Zスコア標準化:")
for col in stats:
    df[f'{col}_z'] = zscore(df[col])
print(df[['Name'] + [f'{col}_z' for col in stats]])
# Program Name: pokemon_stat_analysis_integrated.py
# Creation Date: 20250420
# Overview: An integrated analysis program covering data summarization, estimation and testing, multivariate analysis, clustering, and mathematical foundations (sigma notation, differentiation, integration, exponentials and logarithms, unbiased variance, degrees of freedom) using Pokémon base stats
# Usage: Run with `python pokemon_stat_analysis_integrated.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, norm, linregress, f_oneway, percentileofscore, zscore
# --- ポケモン種族値の定義 / Pokémon base stat definitions (df must exist before the mathematical supplement below uses it) ---
pokemon = {
'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Mewtwo', 'Bulbasaur'],
'HP': [35, 78, 160, 106, 45],
'Attack': [55, 84, 110, 110, 49],
'Defense': [40, 78, 65, 90, 49],
'Speed': [90, 100, 30, 130, 45]
}
df = pd.DataFrame(pokemon)
# --- 数学的補足 / Mathematical Supplement ---
# シグマ(Σ): 合計演算 / Sigma notation: summation
print("\n[数学補足] Σ記号による攻撃値の合計: ΣAttack =", np.sum(df['Attack']))
# 微分の基本(例: x² の微分)/ Basic differentiation (example: derivative of x²)
from sympy import symbols, diff, integrate
x = symbols('x')
expr = x**2
derivative = diff(expr, x)
print("[数学補足] 微分 d/dx(x²) =", derivative)
# 積分の基本(例: x の積分)/ Basic integration (example: integral of x)
integral = integrate(x, x)
print("[数学補足] 積分 ∫x dx =", integral)
# 指数と対数 / Exponentials and logarithms
print("[数学補足] 指数 e^2 =", np.exp(2))
print("[数学補足] 自然対数 log(10) =", np.log(10))
print("[数学補足] 常用対数 log10(1000) =", np.log10(1000))
# 標本分散と不偏分散 / Sample variance vs. unbiased variance
var_n = np.var(df['HP'], ddof=0)   # 標本分散 / sample variance (n in the denominator)
var_n1 = np.var(df['HP'], ddof=1)  # 不偏分散 / unbiased variance (n - 1 in the denominator)
print("[数学補足] 標本分散(ddof=0)=", round(var_n, 3))
print("[数学補足] 不偏分散(ddof=1)=", round(var_n1, 3))
# 自由度(自由に変えられるデータ数)/ Degrees of freedom (number of freely varying values)
n = len(df['HP'])
print("[数学補足] 自由度(n-1)=", n - 1)
# (以下、省略:既存の統計処理と可視化コードはそのまま)/ (The remaining statistical analysis and plotting code is unchanged)
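# --- 補足 / Supplement: 標本分散と不偏分散の関係 / relation between sample and unbiased variance (a minimal numerical check) ---
# s²(ddof=1) = s²(ddof=0) * n / (n - 1)
n_rows = len(df['HP'])
print("[数学補足] var(ddof=1) == var(ddof=0) * n/(n-1):",
      np.isclose(np.var(df['HP'], ddof=1), np.var(df['HP'], ddof=0) * n_rows / (n_rows - 1)))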