コピペと統計2級で学ぶポケモン / Learning Pokémon with Copy-Paste and Statistics Certification Grade 2

Posted at 2025-04-17

# Program Name: pokemon_statistical_analysis.py
# Creation Date: 20250417
# Overview: A program to analyze Pokémon data using histograms, boxplots, correlation, regression, time-series plots, and odds ratio
# Usage: To run the program, use the command `python pokemon_statistical_analysis.py` in the terminal

# --- ライブラリのインストール(Google Colab等で使用時に有効)/ Install required libraries ---
# !pip install pandas matplotlib seaborn scikit-learn statsmodels

# --- ライブラリのインポート / Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import statsmodels.api as sm

# === 数値と設定を一箇所に集約 / Centralized parameter definitions ===
# ポケモンのステータスデータ / Pokémon stats data
pokemon_data = {
    'No': [1, 2, 3, 6, 25, 26, 143, 149, 150],
    'Name': ['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charizard', 'Pikachu', 'Raichu', 'Snorlax', 'Dragonite', 'Mewtwo'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130]
}
hp_column = 'HP'
attack_column = 'Attack'
defense_column = 'Defense'
speed_column = 'Speed'
stat_columns = [hp_column, attack_column, defense_column, speed_column]

# --- データフレームの作成 / Create DataFrame ---
df = pd.DataFrame(pokemon_data)

# === ヒストグラムと累積分布 Histogram & Cumulative Distribution ===
plt.figure(figsize=(10, 4))
plt.hist(df[hp_column], bins=5, alpha=0.7, edgecolor='black')
plt.title('Histogram of HP')
plt.xlabel('HP')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 4))
plt.hist(df[hp_column], bins=5, cumulative=True, alpha=0.7, edgecolor='black')
plt.title('Cumulative Distribution of HP')
plt.xlabel('HP')
plt.ylabel('Cumulative Frequency')
plt.grid(True)
plt.show()

# === 中心と散らばりの指標 / Central Tendency and Dispersion ===
mean_hp = df[hp_column].mean()
median_hp = df[hp_column].median()
std_hp = df[hp_column].std()
var_hp = df[hp_column].var()
range_hp = df[hp_column].max() - df[hp_column].min()
print(f"Mean HP: {mean_hp:.2f} / 平均")
print(f"Median HP: {median_hp:.2f} / 中央値")
print(f"Std Dev HP: {std_hp:.2f} / 標準偏差")
print(f"Variance HP: {var_hp:.2f} / 分散")
print(f"Range HP: {range_hp} / 範囲")

# === 要約統計量と箱ひげ図 / Summary Stats & Boxplot ===
summary_stats = df[stat_columns].describe()
print("\nSummary Statistics / 要約統計量:\n", summary_stats)

plt.figure(figsize=(10, 5))
sns.boxplot(data=df[stat_columns])
plt.title('Boxplot of Stats')
plt.ylabel('Stat Value')
plt.grid(True)
plt.show()

# === 散布図と相関係数 / Scatter Plot & Correlation ===
plt.figure(figsize=(8, 6))
sns.scatterplot(x=attack_column, y=hp_column, data=df)
plt.title('Scatter Plot: Attack vs HP')
plt.xlabel('Attack')
plt.ylabel('HP')
plt.grid(True)
plt.show()

corr, _ = pearsonr(df[attack_column], df[hp_column])
print(f"Pearson Correlation (Attack vs HP): {corr:.2f} / ピアソン相関係数")

# === 回帰直線と決定係数 / Regression Line & R² ===
X = df[[attack_column]]
y = df[hp_column]
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)

plt.figure(figsize=(8, 6))
plt.scatter(X, y, label='Data')
plt.plot(X, y_pred, color='red', label='Regression Line')
plt.title('Linear Regression: Attack vs HP')
plt.xlabel('Attack')
plt.ylabel('HP')
plt.legend()
plt.grid(True)
plt.show()
print(f"R-squared: {r2:.3f} / 決定係数")

# === 時系列データプロット / Time-Series-Style Plot ===
plt.figure(figsize=(10, 4))
plt.plot(df['No'], df[hp_column], marker='o')
plt.title('Time Series-like Plot of HP by No')
plt.xlabel('Pokémon No')
plt.ylabel('HP')
plt.grid(True)
plt.show()

# === 自己相関(ラグ1) / Autocorrelation (Lag-1) ===
acf_vals = sm.tsa.acf(df[hp_column], nlags=3)
print(f"Autocorrelation (lags 0-3): {acf_vals} / 自己相関")

# === クロス集計とオッズ比 / Crosstab and Odds Ratio ===
df['High_HP'] = df[hp_column] >= df[hp_column].median()
df['High_Attack'] = df[attack_column] >= df[attack_column].median()
crosstab = pd.crosstab(df['High_HP'], df['High_Attack'])
print("\nCrosstab:\n", crosstab)

# オッズ比の計算 / Calculate Odds Ratio
try:
    odds_ratio = (crosstab.loc[True, True] * crosstab.loc[False, False]) / \
                 (crosstab.loc[True, False] * crosstab.loc[False, True])
    print(f"Odds Ratio: {odds_ratio:.2f} / オッズ比")
except ZeroDivisionError:
    print("Odds Ratio: Undefined due to division by zero. / ゼロ割により定義できません")
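# 参考(追記): セルに0がある場合の補正例(各セルに0.5を加えるHaldane-Anscombe補正)
# An added illustrative sketch (not part of the original analysis): a continuity-corrected
# odds ratio that adds 0.5 to every cell of the same 2x2 crosstab, so it stays defined even
# when a cell count is zero.
ct_corrected = crosstab.reindex(index=[False, True], columns=[False, True], fill_value=0) + 0.5
odds_ratio_corrected = (ct_corrected.loc[True, True] * ct_corrected.loc[False, False]) / \
                       (ct_corrected.loc[True, False] * ct_corrected.loc[False, True])
print(f"Continuity-corrected Odds Ratio: {odds_ratio_corrected:.2f} / 補正オッズ比")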

# プログラム名: pokemon_lln_clt_simulation.py
# Program Name: pokemon_lln_clt_simulation.py
# 内容: セクション3の主要トピック(大数の法則と中心極限定理)をポケモン色違いの確率で再現
# Description: Simulate LLN and CLT using the shiny Pokémon encounter rate

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# --- 共通設定 / Common Settings ---
np.random.seed(0)
shiny_prob = 1 / 4096  # 色違いポケモンの理論出現率 / Theoretical shiny Pokémon rate (~0.0244%)

# --- ① 大数の法則(LLN)/ Law of Large Numbers ---
# 色違いポケモンを捕まえ続けると、出現率が理論値に収束する様子を観察
# Observe how the shiny rate converges to the theoretical value as trials increase
trials = 10000  # 捕獲数 / Number of Pokémon caught
shiny_results = np.random.binomial(n=1, p=shiny_prob, size=trials)
shiny_rates = np.cumsum(shiny_results) / np.arange(1, trials + 1)

# --- ② 中心極限定理(CLT)/ Central Limit Theorem ---
# 各町のトレーナーが30匹ずつ捕獲し、その色違い率(標本平均)を集めたときの分布を確認
# Simulate shiny rate means from multiple towns (samples of size 30)
sample_size = 30
num_samples = 1000
shiny_means = [np.mean(np.random.binomial(1, shiny_prob, sample_size)) for _ in range(num_samples)]

# --- ③ 正規分布との比較 / Normal Distribution Overlay ---
# CLTによりサンプル平均の分布は正規分布に近づくことを確認
# According to CLT, sample means should approach a normal distribution
mean_theory = shiny_prob
std_theory = np.sqrt(shiny_prob * (1 - shiny_prob) / sample_size)
x = np.linspace(0, max(shiny_means) + 0.001, 1000)
y = norm.pdf(x, loc=mean_theory, scale=std_theory)

# --- プロット作成 / Create Plots ---
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# LLNプロット:色違い出現率の収束 / Convergence of shiny rate
ax[0].plot(shiny_rates, label='Sample Shiny Rate')
ax[0].axhline(shiny_prob, color='red', linestyle='--', label='True Shiny Rate (1/4096)')
ax[0].set_title('LLN: Shiny Pokémon Rate Converges to 1/4096')
ax[0].set_xlabel("Number of Pokémon Caught")
ax[0].set_ylabel("Observed Shiny Rate")
ax[0].legend()
ax[0].grid(True)

# CLTプロット:標本平均分布と正規分布 / Sample means and normal approximation
ax[1].hist(shiny_means, bins=30, density=True, alpha=0.7, color='gold', edgecolor='black', label='Sample Means')
ax[1].plot(x, y, 'r--', label='Normal Approximation')
ax[1].set_title("CLT: Shiny Rate Averages Across Towns (n=30)")
ax[1].set_xlabel("Town Average Shiny Rate")
ax[1].set_ylabel("Density")
ax[1].legend()
ax[1].grid(True)

plt.tight_layout()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, chi2, f

# --- プログラム名 / Program Name ---
# pokemon_confidence_intervals_combined.py
# ポケモンのデータを使って区間推定の例を示す
# Demonstrates confidence intervals using Pokémon data

# ------------------------------
# ① 母平均の区間推定 / Confidence Interval for Population Mean
# 例: ポケモンのHP / Example: HP of Pokémon
# ------------------------------
mu_hp = 60              # 仮定された母平均 / Assumed population mean
sigma_hp = 15           # 母標準偏差(既知)/ Known population std. dev.
n_hp = 36               # サンプル数 / Sample size
xbar_hp = 64            # 標本平均 / Sample mean

z = norm.ppf(0.975)     # 95%信頼区間のz値 / z-score for 95% CI
se_hp = sigma_hp / np.sqrt(n_hp)  # 標準誤差 / Standard error
ci_hp = (xbar_hp - z * se_hp, xbar_hp + z * se_hp)  # 信頼区間 / Confidence interval

# ------------------------------
# ② 母分散の区間推定 / Confidence Interval for Population Variance
# 例: ダメージのばらつき / Example: Damage variance
# ------------------------------
s2_dmg = 225            # 標本分散 / Sample variance
n_dmg = 36              # サンプルサイズ / Sample size
alpha = 0.05            # 有意水準 / Significance level
dof_dmg = n_dmg - 1     # 自由度 / Degrees of freedom

# カイ二乗分布を用いた信頼区間 / CI using chi-square distribution
ci_var = (
    (dof_dmg * s2_dmg) / chi2.ppf(1 - alpha / 2, dof_dmg),
    (dof_dmg * s2_dmg) / chi2.ppf(alpha / 2, dof_dmg)
)

# ------------------------------
# ③ 母比率の区間推定 / Confidence Interval for Proportion
# 例: 色違いの出現率 / Example: Shiny Pokémon appearance rate
# ------------------------------
x_shiny = 3             # 色違いの数 / Number of shinies
n_shiny = 1000          # 試行回数 / Number of encounters

p_hat = x_shiny / n_shiny                  # 標本比率 / Sample proportion
se_shiny = np.sqrt(p_hat * (1 - p_hat) / n_shiny)  # 標準誤差 / Standard error
ci_prop = (p_hat - z * se_shiny, p_hat + z * se_shiny)  # 信頼区間 / Confidence interval

# ------------------------------
# ④ 分散比の区間推定 / Confidence Interval for Variance Ratio
# 例: 炎技と水技の分散比 / Example: Fire vs Water move variance
# ------------------------------
s1, n1 = 10, 16     # 炎タイプの分散とサンプル数 / Fire-type
s2, n2 = 5, 21      # 水タイプの分散とサンプル数 / Water-type

# F分布を用いた信頼区間 / CI using F-distribution
ci_var_ratio = (
    (s1 / s2) / f.ppf(1 - alpha / 2, n1 - 1, n2 - 1),
    (s1 / s2) / f.ppf(alpha / 2, n1 - 1, n2 - 1)
)

# ------------------------------
# 結果出力 / Print Results
# ------------------------------
print("【ポケモンの区間推定まとめ / Pokémon Confidence Interval Summary】")
print(f"① 平均HPの区間推定 / Mean HP CI: {ci_hp}")
print(f"② ダメージ分散の区間推定 / Damage Variance CI: {ci_var}")
print(f"③ 色違い出現率の区間推定 / Shiny Appearance CI: {ci_prop}")
print(f"④ 技の分散比の区間推定(炎/水)/ Move Variance Ratio CI: {ci_var_ratio}")

# ------------------------------
# 可視化 / Visualization of Confidence Intervals
# ------------------------------
labels = ['HP Mean', 'Damage Variance', 'Shiny Rate', 'Variance Ratio']
lower_bounds = [ci_hp[0], ci_var[0], ci_prop[0], ci_var_ratio[0]]
upper_bounds = [ci_hp[1], ci_var[1], ci_prop[1], ci_var_ratio[1]]
means = [(l + u) / 2 for l, u in zip(lower_bounds, upper_bounds)]
errors = [(u - l) / 2 for l, u in zip(lower_bounds, upper_bounds)]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, means, yerr=errors, capsize=10, color='skyblue')
ax.set_title("Pokémon Confidence Intervals")  # グラフタイトル / Title
ax.set_ylabel("Estimated Value")             # y軸ラベル / y-axis label
ax.grid(True)
plt.tight_layout()
plt.show()


# Program Name: pokemon_probability_stats.py
# Creation Date: 20250417
# Overview: A program to analyze Pokémon data using probability, expectation, variance, distributions, and correlation
# Usage: To run the program, use the command `python pokemon_probability_stats.py` in the terminal

# --- 必要なライブラリのインストール(Google Colab等の場合有効化)/ Install required libraries ---
# !pip install pandas matplotlib seaborn scipy

# --- ライブラリのインポート / Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns

# === パラメータの一元管理 / Centralized parameter definitions ===
pokemon_data = {
    'Name': ['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charizard', 'Pikachu', 'Raichu', 'Snorlax', 'Dragonite', 'Mewtwo'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90]
}

stat_target = 'Attack'    # 解析対象の変数 / Target variable for probability analysis
var_x = 'Attack'          # 相関用X軸変数 / Variable for x-axis (correlation)
var_y = 'Defense'         # 相関用Y軸変数 / Variable for y-axis (correlation)

# --- データフレーム作成 / Create DataFrame ---
df = pd.DataFrame(pokemon_data)

# === 期待値・分散・標準偏差の計算 / Calculate Expectation, Variance, Std ===
mean_val = df[stat_target].mean()      # 平均値 / Mean
var_val = df[stat_target].var()        # 分散 / Variance
std_val = df[stat_target].std()        # 標準偏差 / Standard Deviation

# --- Zスコア計算 / Calculate Z-score (Standardization) ---
df[f'{stat_target}_z'] = (df[stat_target] - mean_val) / std_val

# --- 正規分布とヒストグラムのプロット / Histogram and Normal PDF ---
x_range = np.linspace(df[stat_target].min(), df[stat_target].max(), 100)
pdf_values = norm.pdf(x_range, loc=mean_val, scale=std_val)

plt.figure(figsize=(8, 4))
plt.hist(df[stat_target], bins=5, density=True, alpha=0.6, edgecolor='black', label=f'{stat_target} Histogram')
plt.plot(x_range, pdf_values, color='red', label='Normal Approximation')
plt.title(f'{stat_target} Distribution of Pokémon')
plt.xlabel(stat_target)
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# --- 相関関係の散布図 / Scatter plot for correlation ---
plt.figure(figsize=(8, 6))
sns.scatterplot(x=var_x, y=var_y, data=df)
plt.title(f'{var_x} vs {var_y}')
plt.xlabel(var_x)
plt.ylabel(var_y)
plt.grid(True)
plt.show()

# --- 共分散と相関係数 / Covariance and Correlation Coefficient ---
covariance = df[[var_x, var_y]].cov().iloc[0, 1]
correlation = df[[var_x, var_y]].corr().iloc[0, 1]

# --- 結果表示 / Display results ---
print(f"Mean of {stat_target} / 平均: {mean_val:.2f}")
print(f"Variance of {stat_target} / 分散: {var_val:.2f}")
print(f"Standard Deviation of {stat_target} / 標準偏差: {std_val:.2f}")
print(f"Covariance between {var_x} and {var_y} / 共分散: {covariance:.2f}")
print(f"Correlation Coefficient / 相関係数: {correlation:.2f}")
print("\nZ-scored DataFrame:")
print(df[[stat_target, f'{stat_target}_z']])

# プログラム名: pokemon_ci_diff_proportion.py
# 内容: 炎タイプと水タイプで「ひるませ技」成功率の差の区間推定
# Description: Confidence interval for the difference in flinch move success rate between Fire-type and Water-type Pokémon

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# --- データ定義 / Define sample data ---
x1, n1 = 16, 80   # 炎タイプ: 成功回数 / 試行数 (Fire-type: successes / trials)
x2, n2 = 9, 90    # 水タイプ: 成功回数 / 試行数 (Water-type: successes / trials)

# --- 標本比率 / Sample proportions ---
p1 = x1 / n1      # 炎タイプの成功率 / Fire success rate
p2 = x2 / n2      # 水タイプの成功率 / Water success rate
diff = p1 - p2    # 成功率の差 / Difference in proportions

# --- 標準誤差とZ値 / Standard error and z-value ---
se = np.sqrt((p1 * (1 - p1)) / n1 + (p2 * (1 - p2)) / n2)
z = norm.ppf(0.975)  # 95%信頼区間に対応するz値 / z-score for 95% CI

# --- 区間推定 / Compute confidence interval ---
ci_lower = diff - z * se
ci_upper = diff + z * se

# --- 結果表示 / Print results ---
print("🔥 Fire-Type Success Rate:", round(p1, 3))
print("💧 Water-Type Success Rate:", round(p2, 3))
print("📏 Difference (Fire - Water):", round(diff, 3))
print("🧾 95% Confidence Interval:", f"[{ci_lower:.3f}, {ci_upper:.3f}]")

# --- グラフ表示 / Visualization ---
fig, ax = plt.subplots(figsize=(8, 5))

# バーのラベルと値 / Labels and bar heights
labels = ['Fire', 'Water', 'Difference']
values = [p1, p2, diff]
errors = [z * np.sqrt(p1 * (1 - p1) / n1), z * np.sqrt(p2 * (1 - p2) / n2), z * se]

# 棒グラフ + エラーバー / Bar chart with error bars
bars = ax.bar(labels, values, yerr=errors, capsize=10, color=['orangered', 'dodgerblue', 'gray'])

# 0ラインを描画 / Horizontal line at y=0
ax.axhline(0, color='black', linestyle='--', linewidth=1)

# グラフの装飾 / Chart styling
ax.set_ylabel("Success Rate / Difference")
ax.set_title("Confidence Interval for Flinch Move Success Rate\n(Fire vs Water Type Pokémon)")
ax.grid(True, linestyle=':', alpha=0.7)
plt.tight_layout()
plt.show()



# プログラム名: pokemon_test_diff_proportion.py
# 内容: 母比率の差に関する仮説検定(2標本問題)
# Description: Hypothesis test for difference in proportions (Fire vs Water Pokémon)

import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

# --- 標本データ / Sample data ---
x1, n1 = 16, 80  # 炎タイプ: 成功回数 / 試行数 (Fire-type: successes / trials)
x2, n2 = 9, 90   # 水タイプ: 成功回数 / 試行数 (Water-type: successes / trials)

# --- 標本比率 / Sample proportions ---
p1 = x1 / n1     # Fire success rate
p2 = x2 / n2     # Water success rate

# --- プールされた比率 / Pooled proportion ---
# 帰無仮説: p1 = p2 のもとで全体の成功率を計算
# Under H0: p1 = p2, compute pooled success rate
p_pooled = (x1 + x2) / (n1 + n2)

# --- 標準誤差(プール法)/ Standard error under H0 (pooled SE) ---
se_pooled = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))

# --- Z値の計算 / Calculate Z-statistic ---
z = (p1 - p2) / se_pooled

# --- 両側検定のP値 / Two-tailed p-value ---
p_value = 2 * (1 - norm.cdf(abs(z)))

# --- 結果表示 / Display results ---
print("🔥 Fire-Type Success Rate:", round(p1, 3))
print("💧 Water-Type Success Rate:", round(p2, 3))
print("📊 Z-score:", round(z, 3))
print("📉 P-value:", round(p_value, 4))

# --- 結論 / Hypothesis decision ---
alpha = 0.05  # 有意水準 / Significance level
if p_value < alpha:
    print("✅ 結論: 有意差あり(帰無仮説棄却) / Significant difference (Reject H0)")
else:
    print("❎ 結論: 有意差なし(帰無仮説を棄却できない) / No significant difference (Fail to reject H0)")

# --- プロット / Visual representation of the test ---
x = np.linspace(-4, 4, 500)
y = norm.pdf(x)

plt.figure(figsize=(10, 5))
plt.plot(x, y, label="Standard Normal Distribution")  # 標準正規分布 / Standard normal curve
plt.axvline(z, color='red', linestyle='--', label=f"Z = {z:.2f}")  # Z値の線 / Z-score line
plt.fill_between(x, y, where=(abs(x) > abs(z)), color='orange', alpha=0.4, label="Two-tailed P-value Region")  # 観測Z値より外側の領域(両側P値に対応)/ Area beyond the observed |Z|, corresponding to the two-tailed p-value
plt.title("Hypothesis Test for Difference in Proportions\n(Fire vs Water Pokémon)")
plt.xlabel("Z-value")
plt.ylabel("Density")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# --- プログラム名: pokemon_chi_square_tests_combined.py ---
# 内容: 適合度の検定と独立性の検定を統合し、グラフ付きで表示するポケモン統計プログラム
# Description: Combined chi-square goodness-of-fit and independence tests on Pokémon data, with plots

# ------------------------------
# 適合度の検定 / Goodness-of-Fit
# ------------------------------
# 観測データ:通常色と色違い
observed_gof = np.array([9998, 2])
expected_ratio = np.array([1 - 1/4096, 1/4096])
expected_gof = expected_ratio * observed_gof.sum()

# カイ二乗適合度の検定
chi2_stat, p_value_gof = stats.chisquare(f_obs=observed_gof, f_exp=expected_gof)

# ------------------------------
# 独立性の検定 / Test of Independence
# ------------------------------
# クロス集計データ:タイプ × 状態異常
data_indep = pd.DataFrame({
    'Burn': [30, 5],
    'Paralyze': [10, 25]
}, index=['Fire', 'Water'])

chi2_indep, p_value_indep, dof, expected_indep = stats.chi2_contingency(data_indep)

# ------------------------------
# 結果表示 / Display Results
# ------------------------------
print("【適合度の検定 / Goodness-of-Fit Test】")
print("観測値 / Observed:", observed_gof)
print("期待値 / Expected:", expected_gof.round(2))
print(f"Chi2統計量 / Chi2 statistic: {chi2_stat:.3f}, P値 / P-value: {p_value_gof:.5f}")
if p_value_gof < 0.05:
    print("✅ 結論: 理論比率と一致しない(有意差あり) / Conclusion: deviates from the theoretical ratio (significant)")
else:
    print("❎ 結論: 理論比率と一致(有意差なし) / Conclusion: consistent with the theoretical ratio (not significant)")

print("\n【独立性の検定 / Test of Independence】")
print("観測データ / Observed:\n", data_indep)
print("期待度数 / Expected:\n", pd.DataFrame(expected_indep, index=data_indep.index, columns=data_indep.columns).round(2))
print(f"Chi2統計量 / Chi2 statistic: {chi2_indep:.3f}, P値 / P-value: {p_value_indep:.4f}")
if p_value_indep < 0.05:
    print("✅ 結論: タイプと状態異常に関係あり(独立でない) / Conclusion: Type and status condition are related (not independent)")
else:
    print("❎ 結論: タイプと状態異常は独立(関係なし) / Conclusion: Type and status condition are independent")

# ------------------------------
# プロット / Plot
# ------------------------------
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# 適合度の検定:棒グラフ
labels = ['Normal', 'Shiny']
x = np.arange(len(labels))
width = 0.35
ax[0].bar(x - width/2, observed_gof, width, label='Observed', alpha=0.7)
ax[0].bar(x + width/2, expected_gof, width, label='Expected', alpha=0.7)
ax[0].set_title('Goodness-of-Fit: Shiny Rate')
ax[0].set_xticks(x)
ax[0].set_xticklabels(labels)
ax[0].set_ylabel("Count")
ax[0].legend()
ax[0].grid(True)

# 独立性の検定:クロス集計
data_indep.plot(kind='bar', ax=ax[1])
ax[1].set_title('Independence Test: Type vs Status')
ax[1].set_ylabel("Count")
ax[1].legend(title="Status")
ax[1].grid(True)

plt.tight_layout()
plt.show()

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# --- プログラム名: pokemon_hypothesis_tests_with_plot.py ---
# 内容: ポケモンのデータを使った仮説検定(平均・分散・比率・差)とP値の可視化
# Description: Hypothesis testing (mean, variance, proportion, difference) with Pokémon data

# ------------------------------
# ① 母平均の仮説検定(1標本) / One-sample test for mean
sample_hp = np.array([62, 58, 59, 63, 61, 60, 64, 59, 62, 58])
mu_hp = 60  # 仮定された母平均 / Hypothesized population mean
x_bar = sample_hp.mean()
s = sample_hp.std(ddof=1)
n = len(sample_hp)
t_stat = (x_bar - mu_hp) / (s / np.sqrt(n))
p_val_mean = 2 * (1 - stats.t.cdf(abs(t_stat), df=n - 1))

# ------------------------------
# ② 母分散の仮説検定(1標本) / One-sample test for variance
s2 = sample_hp.var(ddof=1)
chi2_stat = (n - 1) * s2 / 100  # 仮定された母分散 = 100
p_val_var = 2 * min(
    stats.chi2.cdf(chi2_stat, df=n - 1),
    1 - stats.chi2.cdf(chi2_stat, df=n - 1)
)

# ------------------------------
# ③ 母比率の仮説検定(1標本) / One-sample test for proportion
x_shiny = 4
n_shiny = 8000
p0 = 1 / 4096
p_hat = x_shiny / n_shiny
se = np.sqrt(p0 * (1 - p0) / n_shiny)
z_stat = (p_hat - p0) / se
p_val_prop = 2 * (1 - stats.norm.cdf(abs(z_stat)))

# ------------------------------
# ④ 母平均の差の仮説検定(2標本) / Two-sample test for mean difference
atk_pre = np.array([50, 48, 52, 51, 49])
atk_post = np.array([60, 58, 63, 61, 59])
t2_stat, p_val_diff = stats.ttest_ind(atk_pre, atk_post)

# ------------------------------
# ⑤ 母分散比の仮説検定(2標本) / Two-sample test for variance ratio
var1 = np.var([40, 42, 43, 38, 41], ddof=1)
var2 = np.var([28, 30, 29, 31, 27], ddof=1)
f_stat = var1 / var2
df1, df2 = 4, 4
p_val_f = 2 * min(
    stats.f.cdf(f_stat, df1, df2),
    1 - stats.f.cdf(f_stat, df1, df2)
)

# ------------------------------
# 結果表示 / Print Results
print("【ポケモンの仮説検定まとめ / Pokémon Hypothesis Test Summary】")
print(f"① 平均HP = 60?: t = {t_stat:.3f}, p = {p_val_mean:.4f}")
print(f"② 分散 = 100?: χ² = {chi2_stat:.3f}, p = {p_val_var:.4f}")
print(f"③ 色違い率 = 1/4096?: z = {z_stat:.3f}, p = {p_val_prop:.4f}")
print(f"④ 進化前後の攻撃力差: t = {t2_stat:.3f}, p = {p_val_diff:.4f}")
print(f"⑤ 技ばらつき差(炎/水): F = {f_stat:.3f}, p = {p_val_f:.4f}")

# ------------------------------
# P値の可視化 / P-value Visualization
labels = [
    "HP = 60 ?",
    "Variance = 100 ?",
    "Shiny Rate = 1/4096 ?",
    "Attack Before vs After",
    "Variance Fire vs Water"
]
p_values = [p_val_mean, p_val_var, p_val_prop, p_val_diff, p_val_f]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]

plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Hypothesis Test Results for Pokémon Data")
plt.ylabel("P-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()

# 各バーの上にP値を表示 / Annotate p-values above bars
for bar, p in zip(bars, p_values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
             ha='center', va='bottom')

plt.tight_layout()
plt.show()


# 前のセルがリセットされたので再定義が必要 / Re-define the data because the previous cell was reset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew, kurtosis
import pprint

# --- データの準備 / Prepare simplified Pokémon data ---
data = {
    "名前": ["フシギダネ", "フシギソウ", "フシギバナ", "ヒトカゲ", "リザード", "リザードン", "ゼニガメ", "カメール", "カメックス", "ミュウツー", "ミュウ"],
    "HP": [45, 60, 80, 39, 58, 78, 44, 59, 79, 106, 100],
    "攻撃": [49, 62, 82, 52, 64, 84, 48, 63, 83, 110, 100],
    "防御": [49, 63, 83, 43, 58, 78, 65, 80, 100, 90, 100],
    "素早": [45, 60, 80, 65, 80, 100, 43, 58, 78, 130, 100],
    "特殊": [65, 80, 100, 50, 65, 85, 50, 65, 85, 154, 100],
    "合計": [253, 325, 425, 249, 325, 425, 250, 325, 425, 590, 500]
}

df = pd.DataFrame(data)

# --- 統計量の計算 / Compute Descriptive Statistics ---
stats_columns = ["HP", "攻撃", "防御", "素早", "特殊", "合計"]
summary = {
    "平均 / Mean": df[stats_columns].mean().to_dict(),
    "中央値 / Median": df[stats_columns].median().to_dict(),
    "最頻値 / Mode": df[stats_columns].mode().iloc[0].to_dict(),
    "標準偏差 / Std Dev": df[stats_columns].std().to_dict(),
    "変動係数 / Coef of Var": (df[stats_columns].std() / df[stats_columns].mean()).to_dict(),
    "歪度 / Skewness": df[stats_columns].apply(skew).to_dict(),
    "尖度 / Kurtosis": df[stats_columns].apply(kurtosis).to_dict()
}

# --- 結果出力 / Display Summary ---
pprint.pprint(summary)
# プログラム名: pokemon_regression_anova_combo.py
# 内容: ポケモンのステータスで回帰分析(単回帰・重回帰)と一元配置分散分析(ANOVA)を行う
# Description: Simple/multiple linear regression and one-way ANOVA on Pokémon base stats

import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# ------------------------------
# データ作成 / Create Pokémon sample data
data = {
    'Name': ['フシギダネ', 'ヒトカゲ', 'ゼニガメ', 'ピカチュウ', 'ライチュウ', 'サンドパン', 'ピクシー', 'ガーディ', 'ウインディ', 'ニョロボン'],
    'HP':     [45, 39, 44, 35, 60, 75, 95, 55, 90, 90],
    'Attack': [49, 52, 48, 55, 90, 100, 70, 70, 110, 95],
    'Defense':[49, 43, 65, 40, 55, 110, 73, 45, 80, 95],
    'Sp.Atk': [65, 60, 50, 50, 90, 45, 95, 70, 100, 70],
    'Sp.Def': [65, 50, 64, 50, 80, 55, 90, 50, 80, 90],
    'Speed':  [45, 65, 43, 90, 110, 65, 60, 60, 95, 70],
    'Type':   ['Grass', 'Fire', 'Water', 'Electric', 'Electric', 'Ground', 'Fairy', 'Fire', 'Fire', 'Water']
}
df = pd.DataFrame(data)

# ------------------------------
# 単回帰分析 / Simple Linear Regression: Attack ~ HP
X1 = sm.add_constant(df['HP'])  # 定数項追加 / Add intercept
y = df['Attack']
model1 = sm.OLS(y, X1).fit()

# ------------------------------
# 重回帰分析 / Multiple Linear Regression: Attack ~ HP + Sp.Atk + Speed
X2 = df[['HP', 'Sp.Atk', 'Speed']]
X2 = sm.add_constant(X2)
model2 = sm.OLS(y, X2).fit()

# ------------------------------
# 一元配置分散分析 / One-Way ANOVA: Attack ~ Type
anova_model = sm.OLS.from_formula("Attack ~ C(Type)", data=df).fit()
anova_table = sm.stats.anova_lm(anova_model, typ=2)

# ------------------------------
# 結果表示 / Print summaries
print("=== 単回帰分析 / Simple Linear Regression: Attack ~ HP ===")
print(model1.summary())
print("\n=== 重回帰分析 / Multiple Linear Regression: Attack ~ HP + Sp.Atk + Speed ===")
print(model2.summary())
print("\n=== 一元配置分散分析 / One-Way ANOVA (Attack ~ Type) ===")
print(anova_table)

# ------------------------------
# 可視化(散布図と回帰直線)/ Visualization
plt.figure(figsize=(8, 5))
sns.regplot(x='HP', y='Attack', data=df, ci=95)
plt.title("Simple Linear Regression: Attack ~ HP")
plt.grid(True)
plt.tight_layout()
plt.show()

# ------------------------------
# 決定係数と調整済み決定係数の比較 / R² and Adjusted R²
r2 = model2.rsquared
r2_adj = model2.rsquared_adj
print(f"\n決定係数 R²: {r2:.4f}")
print(f"自由度調整済み決定係数 R²_adj: {r2_adj:.4f}")

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# --- プログラム名: pokemon_statistics_module_s3to5.py ---
# 内容: セクション3〜5の主要統計処理(確率、推定、仮説検定)とP値プロットの統合
# Description: Combines the main statistics of sections 3-5 (probability, estimation, hypothesis testing) with a summary p-value plot

# ------------------------------
# セクション3:確率と分布(例:色違いポケモン)
p_shiny = 1 / 4096
n_trials = 8000
x_shiny = 4
p_hat = x_shiny / n_trials
se_shiny = np.sqrt(p_shiny * (1 - p_shiny) / n_trials)
z_shiny = (p_hat - p_shiny) / se_shiny
p_val_shiny = 2 * (1 - stats.norm.cdf(abs(z_shiny)))

# ------------------------------
# セクション4:区間推定(例:HPの標本平均)
sample_hp = np.array([62, 58, 59, 63, 61, 60, 64, 59, 62, 58])
mu_hp = 60
mean_hp = sample_hp.mean()
std_hp = sample_hp.std(ddof=1)
n = len(sample_hp)
se_hp = std_hp / np.sqrt(n)
t_hp = (mean_hp - mu_hp) / se_hp
p_val_hp = 2 * (1 - stats.t.cdf(abs(t_hp), df=n - 1))

# ------------------------------
# セクション5:仮説検定(例:進化前後の攻撃力)
atk_pre = np.array([50, 48, 52, 51, 49])
atk_post = np.array([60, 58, 63, 61, 59])
t_stat, p_val_attack = stats.ttest_ind(atk_pre, atk_post)

# ------------------------------
# 分散比検定(例:技のばらつき)
var1 = np.var([40, 42, 43, 38, 41], ddof=1)
var2 = np.var([28, 30, 29, 31, 27], ddof=1)
f_stat = var1 / var2
df1, df2 = 4, 4
p_val_var = 2 * min(stats.f.cdf(f_stat, df1, df2), 1 - stats.f.cdf(f_stat, df1, df2))

# ------------------------------
# 可視化:P値のまとめ
labels = ['Shiny Rate Test', 'HP Mean Test', 'Attack Diff Test', 'Variance Ratio Test']
p_values = [p_val_shiny, p_val_hp, p_val_attack, p_val_var]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]

plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Summary of Hypothesis Tests (Sections 3 to 5)")
plt.ylabel("P-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()

for bar, p in zip(bars, p_values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

# --- Re-defining variables due to NameError ---
# Sample Pokémon data
pokemon_data = {
    'Name': ['Bulbasaur', 'Charmander', 'Squirtle', 'Pikachu', 'Sandshrew', 'Meowth', 'Psyduck', 'Mankey', 'Growlithe', 'Geodude'],
    'Attack': [49, 52, 48, 55, 75, 45, 52, 80, 70, 80],
    'HP':     [45, 39, 44, 35, 50, 40, 50, 40, 55, 40],
    'Type':   ['Grass', 'Fire', 'Water', 'Electric', 'Ground', 'Normal', 'Water', 'Fighting', 'Fire', 'Rock']
}
df = pd.DataFrame(pokemon_data)

# Hypothesis Test 1: Is mean attack = 60?
mu_attack = 60
x_bar = df['Attack'].mean()
s = df['Attack'].std(ddof=1)
n = len(df)
t_stat = (x_bar - mu_attack) / (s / np.sqrt(n))
p_attack = 2 * (1 - stats.t.cdf(abs(t_stat), df=n - 1))

# Hypothesis Test 2: Is Pikachu's HP lower than the overall mean?
mu_hp = df['HP'].mean()
pikachu_hp = df[df['Name'] == 'Pikachu']['HP'].values[0]
p_hp = stats.norm.cdf(pikachu_hp, loc=mu_hp, scale=df['HP'].std(ddof=1))
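# 注(追記): これは全ポケモンのHPに当てはめた正規分布における下側確率(片側p値の目安)であり、
# 1つの観測値に対する形式的な検定ではない。
# Note (added): this is the lower-tail probability of Pikachu's HP under a normal distribution
# fitted to all HP values, used as a rough one-sided p-value; it is not a formal test on a single observation.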

# Hypothesis Test 3: Attack comparison between Fire and Water types
atk_fire = df[df['Type'] == 'Fire']['Attack']
atk_water = df[df['Type'] == 'Water']['Attack']
t_type, p_type = stats.ttest_ind(atk_fire, atk_water)

# Hypothesis Test 4: Is attack variance = 100?
s2_attack = df['Attack'].var(ddof=1)
chi2_stat = (n - 1) * s2_attack / 100
p_var = 2 * min(stats.chi2.cdf(chi2_stat, df=n - 1), 1 - stats.chi2.cdf(chi2_stat, df=n - 1))

# --- Plot with English labels and title ---
labels = [
    'Attack Mean = 60',
    'Pikachu HP < Mean HP',
    'Fire vs Water Attack',
    'Attack Variance = 100'
]
p_values = [p_attack, p_hp, p_type, p_var]
colors = ['green' if p >= 0.05 else 'red' for p in p_values]

plt.figure(figsize=(10, 6))
bars = plt.bar(labels, p_values, color=colors)
plt.axhline(0.05, color='blue', linestyle='--', label='Significance Level (0.05)')
plt.title("Hypothesis Tests for Pokémon (Gen 1)")
plt.ylabel("p-value")
plt.ylim(0, 1)
plt.grid(True, axis='y')
plt.legend()

# Display p-values above bars
for bar, p in zip(bars, p_values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{p:.3f}",
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

# プログラム名: pokemon_multi_hit_move_expectation.py
# 内容: 幾何分布に基づくポケモンの連続技の命中回数と期待威力を計算
# Description: Expected number of hits and expected total power of a multi-hit move, modeled with a capped geometric distribution

import numpy as np
import matplotlib.pyplot as plt

# --- 設定 / Parameters ---
p = 0.1             # 各ヒットの失敗確率(miss after hit)
power = 20          # 一撃あたりの威力
max_hits = 10       # 最大ヒット回数(例:ねずみざん)
prob = np.zeros(max_hits + 1)

# --- 幾何分布の上限付きシミュレーション / Capped geometric distribution ---
prob[0] = p  # 最初の失敗(1発も当たらない)
for i in range(1, max_hits):
    prob[i] = (1 - p)**i * p
prob[max_hits] = 1.0 - np.sum(prob[:-1])  # 最後に補正
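# 確認(追記): 打ち切り幾何分布の確率が1に正規化されているかチェック
# Added sanity check: the capped geometric probabilities should sum to 1
assert np.isclose(prob.sum(), 1.0), "Hit-count probabilities should sum to 1"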

# --- 表示 / Print result ---
for i in range(max_hits + 1):
    print(f"Prob {i} Hits: {prob[i]:.3f}")

# --- 期待値計算 / Expected value of power ---
num_hits = np.arange(0, max_hits + 1)
expected_damage = np.sum(power * num_hits * prob)
print(f"\nExpected total power: {expected_damage:.2f}")

# --- 可視化 / Plot ---
plt.figure(figsize=(10, 5))
plt.bar(num_hits, prob, color='skyblue')
plt.xlabel("Number of Hits")
plt.ylabel("Probability")
plt.title(f"Hit Distribution of Multi-Hit Move (max {max_hits}, p_miss = {p})")
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()
# Program Name: pokemon_statistical_analysis_advanced.py
# Creation Date: 20250419
# Overview: Advanced statistical and visual analysis of Pokémon base stats
# Usage: Run with `python pokemon_statistical_analysis_advanced.py`

# --- ライブラリのインストール ---
# !pip install pandas matplotlib seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- データ定義 ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Gengar', 'Machamp'],
    'HP': [35, 78, 160, 60, 90],
    'Attack': [55, 84, 110, 65, 130],
    'Defense': [40, 78, 65, 60, 80],
    'Speed': [90, 100, 30, 110, 55],
    'Special': [50, 85, 65, 130, 65]
}
df = pd.DataFrame(data)
df['Total'] = df[['HP', 'Attack', 'Defense', 'Speed', 'Special']].sum(axis=1)

# === ⑤ 散布図と相関分析 / Scatterplot and correlation ===
plt.figure(figsize=(6, 5))
sns.scatterplot(x='Attack', y='Speed', data=df, s=100)
corr = df['Attack'].corr(df['Speed'])
plt.title(f"Attack vs Speed (corr = {corr:.2f})")
plt.xlabel("Attack")
plt.ylabel("Speed")
plt.grid(True)
plt.tight_layout()
plt.show()

# === ⑥ 主成分分析 (PCA) / PCA to visualize multi-dimensional stats ===
features = ['HP', 'Attack', 'Defense', 'Speed', 'Special']
X = df[features]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(6, 5))
plt.scatter(X_pca[:, 0], X_pca[:, 1], s=100)
for i, name in enumerate(df['Name']):
    plt.text(X_pca[i, 0]+0.01, X_pca[i, 1], name)
plt.title("PCA of Pokémon Stats")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.tight_layout()
plt.show()

# === ⑦ KMeansクラスタリング / Cluster Pokémon by stats ===
kmeans = KMeans(n_clusters=2, random_state=0)
df['Cluster'] = kmeans.fit_predict(X)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Cluster'], s=100, palette='Set2')
for i, name in enumerate(df['Name']):
    plt.text(X_pca[i, 0]+0.01, X_pca[i, 1], name)
plt.title("Clustered Pokémon by Stats (PCA Space)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.tight_layout()
plt.show()

# === ⑧ レーダーチャート / Radar chart for each Pokémon ===
def radar_chart(row, features, title):
    values = row[features].tolist()
    values += values[:1]  # to close the radar circle
    angles = np.linspace(0, 2*np.pi, len(features)+1)

    plt.figure(figsize=(5, 5))
    ax = plt.subplot(111, polar=True)
    ax.plot(angles, values, 'o-', linewidth=2)
    ax.fill(angles, values, alpha=0.25)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features)
    ax.set_title(title)
    plt.tight_layout()
    plt.show()

# レーダーチャートで全ポケモンを可視化 / Visualize every Pokémon with a radar chart
for i, row in df.iterrows():
    radar_chart(row, features, f"{row['Name']}'s Base Stats")

# プログラム名: pokemon_data_analysis_all_in_one.py
# Program Name: pokemon_data_analysis_all_in_one.py
# 作成日 / Creation Date: 20250420
# 概要 / Overview: ポケモンの種族値データを用いて、データ集計、グラフ化、代表値、ばらつき、標準化、相関、確率計算までを一括実行するプログラム。
# 使い方 / Usage: ターミナルで `python pokemon_data_analysis_all_in_one.py` を実行

# --- 必要なライブラリのインストール / Install required libraries ---
# !pip install pandas matplotlib seaborn numpy scipy

# --- ライブラリのインポート / Import required libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- 数値定義 / Centralized parameter definitions ---
pokemon_data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
    'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
    'Defense': [40, 78, 65, 49, 90, 50, 60, 70],
    'Speed': [90, 100, 30, 45, 130, 55, 110, 55],
    'HP': [35, 78, 160, 45, 106, 55, 60, 90]
}
df = pd.DataFrame(pokemon_data)

# === 1. データの集計 / Data Aggregation ===
# 1-1 データ表示 / Show raw data
print(df)

# 1-2 平均/中央値などの統計値 / Basic statistics (mean, median)
desc_stats = df.describe()
print(desc_stats)

# 1-3 グラフ表示:種族値の棒グラフ / Bar chart of base stats
df.set_index('Name')[['Attack', 'Defense', 'Speed', 'HP']].plot(kind='bar')
plt.title('Base Stats of Pokémon')
plt.ylabel('Stat Value')
plt.xlabel('Pokémon')
plt.legend(title='Attributes')
plt.tight_layout()
plt.show()

# === 2. さまざまなグラフ / Various Charts ===
# 2-1 クロス集計(速度と攻撃力)/ Crosstab (Speed category vs Attack category)
df['Speed_Level'] = pd.cut(df['Speed'], bins=[0, 50, 100, 150], labels=['Slow', 'Medium', 'Fast'])
df['Attack_Level'] = pd.cut(df['Attack'], bins=[0, 60, 100, 150], labels=['Low', 'Medium', 'High'])
cross_tab = pd.crosstab(df['Speed_Level'], df['Attack_Level'])
print(cross_tab)

# 2-2 モザイク図 / Mosaic plot (optional with statsmodels or external lib)
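# 参考(追記): statsmodels が入っていればモザイク図を描ける最小スケッチ(statsmodels は上記の
# インストール一覧には含まれない点に注意)。上で作成した Speed_Level / Attack_Level を利用する。
# An added minimal sketch, assuming statsmodels is installed (it is not in the install list above):
# draws a mosaic plot of the Speed_Level and Attack_Level categories created below in 2-1.
from statsmodels.graphics.mosaicplot import mosaic
mosaic(df.astype({'Speed_Level': str, 'Attack_Level': str}), ['Speed_Level', 'Attack_Level'],
       title='Mosaic Plot: Speed Level vs Attack Level')
plt.show()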

# 2-3 積み上げ棒グラフ / Stacked bar (Attack + Defense)
df_plot = df.set_index('Name')[['Attack', 'Defense']]
df_plot.plot(kind='bar', stacked=True)
plt.title('Attack and Defense by Pokémon')
plt.ylabel('Stat Total')
plt.xlabel('Pokémon')
plt.legend(title='Attributes')
plt.tight_layout()
plt.show()

# === 3. 時系列データ(サンプル)/ Time Series Example ===
# 3-1 架空の時系列データを生成 / Generate fake time series (e.g., HP over time)
time = pd.date_range(start='2025-01-01', periods=8, freq='M')
hp_time_df = pd.DataFrame({'Date': time, 'HP': df['HP'].values})
hp_time_df.set_index('Date')['HP'].plot(marker='o')
plt.title('HP Over Time')
plt.ylabel('HP')
plt.xlabel('Date')
plt.grid(True)
plt.tight_layout()
plt.show()

# === 4. 代表値と箱ひげ図 / Representative Values and Boxplot ===
mean_hp = df['HP'].mean()
median_hp = df['HP'].median()
mode_hp = df['HP'].mode()[0]
print(f"Mean HP: {mean_hp}, Median HP: {median_hp}, Mode HP: {mode_hp}")

# 箱ひげ図 / Boxplot
sns.boxplot(data=df[['Attack', 'Defense', 'Speed', 'HP']])
plt.title('Boxplot of Pokémon Stats')
plt.ylabel('Value')
plt.tight_layout()
plt.show()

# === 5. データのばらつき / Data Dispersion ===
# 分散・標準偏差・変動係数 / Variance, Std, CV
for col in ['Attack', 'Defense', 'Speed', 'HP']:
    var = df[col].var()
    std = df[col].std()
    cv = std / df[col].mean()
    print(f"{col}: Variance={var:.2f}, StdDev={std:.2f}, CV={cv:.2f}")

# === 6. データの標準化 / Data Standardization ===
# レーダーチャート用標準化 / Normalize for radar chart
df_norm = df.copy()
cols = ['Attack', 'Defense', 'Speed', 'HP']
df_norm[cols] = (df[cols] - df[cols].mean()) / df[cols].std()

# 偏差値 / Deviation value calculation
df['HP_deviation'] = 50 + 10 * (df['HP'] - df['HP'].mean()) / df['HP'].std()
print(df[['Name', 'HP', 'HP_deviation']])

# === 7. データの相関 / Correlation Analysis ===
plt.scatter(df['Attack'], df['Speed'], s=df['HP'], alpha=0.5)
plt.title('Bubble Chart: Attack vs Speed')
plt.xlabel('Attack')
plt.ylabel('Speed')
plt.tight_layout()
plt.show()

# 数値列のみに限定して相関係数を計算 / Limit to numeric columns for correlation
corr = df.select_dtypes(include=[np.number]).corr()
print("Correlation Matrix:\n", corr)


# === 8. 確率計算 / Probability Calculation ===
# 同時確率と条件付き確率(例:攻撃>80かつHP>100)/ Joint and conditional probability (Attack > 80 and HP > 100)
n_total = len(df)
prob_joint = len(df[(df['Attack'] > 80) & (df['HP'] > 100)]) / n_total
prob_conditional = len(df[(df['Attack'] > 80) & (df['HP'] > 100)]) / len(df[df['Attack'] > 80])
print(f"Joint Probability (Attack > 80 and HP > 100): {prob_joint:.2f}")
print(f"Conditional Probability (HP > 100 | Attack > 80): {prob_conditional:.2f}")

# Program Name: statistics_fundamentals_all_in_one.py
# Creation Date: 20250420
# Overview: 統計学の基本トピックを体系的に網羅し、Pythonで一括実行できる学習・可視化ツール
# Usage: To run the program, use the command `python statistics_fundamentals_all_in_one.py` in the terminal

# --- 必要なライブラリのインストール / Install required libraries ---
# !pip install pandas matplotlib seaborn numpy scipy

# --- ライブラリのインポート / Import required libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from math import factorial

# --- データ定義 / Sample data definition ---
np.random.seed(0)
data = np.random.normal(loc=50, scale=10, size=100)  # 平均50、標準偏差10の正規分布 / Normal distribution

# === 2. 度数分布とヒストグラム / Frequency Distribution and Histogram ===
plt.hist(data, bins=10, edgecolor='black')
plt.title('Histogram of Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

cum_counts, bin_edges = np.histogram(data, bins=10)
cum_dist = np.cumsum(cum_counts)
print("Cumulative Frequency Distribution:", cum_dist)

# ローレンツ曲線とジニ係数 / Lorenz Curve and Gini Coefficient
sorted_data = np.sort(data)
cumulative = np.cumsum(sorted_data) / np.sum(sorted_data)
lorenz_curve = np.insert(cumulative, 0, 0)
x = np.linspace(0.0, 1.0, len(lorenz_curve))
plt.plot(x, lorenz_curve, label='Lorenz Curve')
plt.plot(x, x, '--', label='Line of Equality')
plt.title('Lorenz Curve')
plt.xlabel('Cumulative Share of People')
plt.ylabel('Cumulative Share of Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

gini = 1 - 2 * np.trapz(lorenz_curve, x)
print(f"Gini Coefficient: {gini:.3f}")

# === 3. 代表値と分布形状 / Central Tendency and Shape ===
mean_val = np.mean(data)
median_val = np.median(data)
mode_val = pd.Series(data).mode()[0]
skew_val = skew(data)
kurt_val = kurtosis(data)

print(f"Mean: {mean_val:.2f}, Median: {median_val:.2f}, Mode: {mode_val:.2f}")
print(f"Skewness: {skew_val:.2f}, Kurtosis: {kurt_val:.2f}")

sns.boxplot(data=data)
plt.title("Boxplot of Sample Data")
plt.tight_layout()
plt.show()

# === 6. 分散と標準偏差 / Variance and Std Dev ===
var = np.var(data, ddof=1)
std = np.std(data, ddof=1)
cv = std / mean_val
print(f"Variance: {var:.2f}, Std Dev: {std:.2f}, Coefficient of Variation: {cv:.2f}")

# === 7. 場合の数 / Counting ===
n, r = 5, 3
perm = factorial(n) // factorial(n - r)  # 順列 / Permutation
comb = factorial(n) // (factorial(r) * factorial(n - r))  # 組合せ / Combination
print(f"P({n},{r}) = {perm}, C({n},{r}) = {comb}")

# === 9. 期待値 / Expected Value ===
dice = np.array([1, 2, 3, 4, 5, 6])
probs = np.full(6, 1/6)
expectation = np.sum(dice * probs)
print(f"Expected value of a fair dice roll: {expectation:.2f}")

# === 10. ベイズの定理 / Bayes' Theorem ===
P_disease = 0.01
P_pos_given_disease = 0.99
P_pos_given_healthy = 0.05
P_healthy = 1 - P_disease
P_positive = (P_pos_given_disease * P_disease) + (P_pos_given_healthy * P_healthy)
P_disease_given_pos = (P_pos_given_disease * P_disease) / P_positive
print(f"Bayes Theorem: P(Disease | Positive) = {P_disease_given_pos:.3f}")

# Program Name: pokemon_probability_distribution_full.py
# Creation Date: 20250420
# Overview: ポケモンの種族値データを使って、離散・連続確率分布、期待値、分散、大数の法則、中心極限定理を一括で学べる統計学統合プログラム
# Usage: python pokemon_probability_distribution_full.py

# --- ライブラリのインポート / Import libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import poisson, norm, binom, geom, expon, uniform

# --- ポケモン種族値データをさらに拡張 / Further Expanded Pokémon base stats ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55,
        60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80,
        79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55,
        60, 95, 60, 105, 55, 75
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55,
        65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82,
        83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130,
        50, 95, 80, 130, 65, 100
    ]
}
df = pd.DataFrame(pokemon)
print(df)



# === 11. 離散型・連続型確率分布 ===
mu_attack = np.mean(df['Attack'])
x_poisson = np.arange(0, 200, 1)
pmf_poisson = poisson.pmf(x_poisson, mu_attack)
plt.plot(x_poisson, pmf_poisson, 'o-', label=f'Poisson(μ={mu_attack:.1f})')
plt.title("Poisson Distribution (Attack)")
plt.xlabel("Attack Value")
plt.ylabel("Probability")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# 正規分布(HP)
mu_hp = np.mean(df['HP'])
sigma_hp = np.std(df['HP'], ddof=1)
x_normal = np.linspace(0, 200, 300)
pdf_normal = norm.pdf(x_normal, mu_hp, sigma_hp)
plt.plot(x_normal, pdf_normal, label=f'N({mu_hp:.1f}, {sigma_hp:.1f}²)')
plt.title("Normal Distribution of HP")
plt.xlabel("HP")
plt.ylabel("Density")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# === 12. CDFと期待値・分散 ===
cdf_normal = norm.cdf(x_normal, mu_hp, sigma_hp)
plt.plot(x_normal, cdf_normal)
plt.title("Cumulative Distribution Function (HP)")
plt.xlabel("HP")
plt.ylabel("Cumulative Probability")
plt.grid(True)
plt.tight_layout()
plt.show()

expected_hp = np.mean(df['HP'])
variance_hp = np.var(df['HP'], ddof=1)
print(f"Expected HP: {expected_hp:.2f}, Variance: {variance_hp:.2f}")

# === 13. 離散分布:二項・幾何 ===
x_binom = np.arange(11)
pmf_binom = binom.pmf(x_binom, n=10, p=0.3)
plt.bar(x_binom, pmf_binom)
plt.title("Binomial Distribution (n=10, p=0.3)")
plt.xlabel("Successes")
plt.ylabel("Probability")
plt.tight_layout()
plt.show()

x_geom = np.arange(1, 11)
pmf_geom = geom.pmf(x_geom, p=0.2)
plt.bar(x_geom, pmf_geom)
plt.title("Geometric Distribution (p=0.2)")
plt.xlabel("Trial")
plt.ylabel("Probability")
plt.tight_layout()
plt.show()

# === 14. 標準正規分布とZ変換 ===
df['HP_zscore'] = (df['HP'] - mu_hp) / sigma_hp
print("Standardized HP (Z-scores):\n", df[['Name', 'HP', 'HP_zscore']])

x_std = np.linspace(-4, 4, 300)
plt.plot(x_std, norm.pdf(x_std), label='Standard Normal')
plt.title("Standard Normal Distribution")
plt.xlabel("Z")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()

# === 15. 指数・一様分布 ===
x_exp = np.linspace(0, 10, 300)
plt.plot(x_exp, expon.pdf(x_exp, scale=1), label='Exponential(λ=1)')
plt.title("Exponential Distribution")
plt.xlabel("Time")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()

x_uni = np.linspace(0, 100, 300)
plt.plot(x_uni, uniform.pdf(x_uni, loc=0, scale=100), label='Uniform(0,100)')
plt.title("Uniform Distribution")
plt.xlabel("Value")
plt.ylabel("Density")
plt.grid(True)
plt.tight_layout()
plt.show()

# === 17. 大数の法則と中心極限定理 ===
# 試行回数を増やすと累積平均が真の平均に収束する様子を確認 / The running mean converges to the true mean as draws accumulate
draws = np.random.choice(df['Attack'], size=2000, replace=True)
running_mean = np.cumsum(draws) / np.arange(1, len(draws) + 1)
plt.plot(running_mean, label='Running Sample Mean')
plt.axhline(np.mean(df['Attack']), color='red', linestyle='--', label='True Mean')
plt.title("Law of Large Numbers Simulation")
plt.xlabel("Number of Draws")
plt.ylabel("Sample Mean")
plt.legend()
plt.tight_layout()
plt.show()

sample_means_dist = np.random.choice(df['Attack'], size=(1000, 30)).mean(axis=1)
sns.histplot(sample_means_dist, kde=True, stat='density')
plt.title("Central Limit Theorem (n=30)")
plt.xlabel("Sample Mean")
plt.tight_layout()
plt.show()

# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定を詳しく学ぶ。
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, t, chi2

# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75
    ]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
n = len(hp_sample)

# === 18. 点推定 / Point Estimation ===
mean_hp = np.mean(hp_sample)
var_hp = np.var(hp_sample, ddof=1)  # 不偏分散
std_hp = np.std(hp_sample, ddof=1)  # 標準偏差
se_hp = std_hp / np.sqrt(n)         # 標準誤差

print("[18] Point Estimation")
print(f"Sample Size n = {n}")
print(f"Sample Mean (μ̂): {mean_hp:.2f}")
print(f"Unbiased Variance (s^2): {var_hp:.2f}")
print(f"Standard Deviation (s): {std_hp:.2f}")
print(f"Standard Error (SE): {se_hp:.2f}\n")

# === 19. 区間推定(母分散既知) / Confidence Interval with Known Variance ===
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
ci_known = (mean_hp - z * se_known, mean_hp + z * se_known)
print("[19] Confidence Interval (σ known)")
print(f"95% CI: {ci_known}\n")

# === 20. 区間推定(母分散未知) / Confidence Interval with Unknown Variance ===
t_score = t.ppf(0.975, df=n-1)
ci_unknown = (mean_hp - t_score * se_hp, mean_hp + t_score * se_hp)
print("[20] Confidence Interval (σ unknown)")
print(f"95% CI: {ci_unknown}\n")

# === 21. 母比率の区間推定 / Population Proportion ===
success = np.sum(hp_sample >= 80)
p_hat = success / n
se_p = np.sqrt(p_hat * (1 - p_hat) / n)
ci_prop = (p_hat - z * se_p, p_hat + z * se_p)
print("[21] Confidence Interval for Proportion (P(HP >= 80))")
print(f"Proportion: {p_hat:.2f}, 95% CI: {ci_prop}\n")

# === 22. 母分散の区間推定 / Variance Estimation ===
chi2_lower = chi2.ppf(0.025, df=n-1)
chi2_upper = chi2.ppf(0.975, df=n-1)
ci_var = ((n - 1) * var_hp / chi2_upper, (n - 1) * var_hp / chi2_lower)
print("[22] Confidence Interval for Population Variance")
print(f"95% CI for Variance: {ci_var}\n")

# --- プロット: 信頼区間の視覚化 / Visualizing Confidence Intervals ---
plt.figure(figsize=(10, 6))
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
plt.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
plt.title("Confidence Intervals for Population Mean HP")
plt.xlabel("HP")
plt.legend()
plt.tight_layout()
plt.show()

# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・検定・相関分析を詳しく学ぶ。
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr

# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)

# === 18. 点推定 ===
mean_hp = np.mean(hp_sample)
var_hp = np.var(hp_sample, ddof=1)
std_hp = np.std(hp_sample, ddof=1)
se_hp = std_hp / np.sqrt(n)

print("[18] Point Estimation")
print(f"Sample Size n = {n}")
print(f"Sample Mean (μ̂): {mean_hp:.2f}")
print(f"Unbiased Variance (s^2): {var_hp:.2f}")
print(f"Standard Deviation (s): {std_hp:.2f}")
print(f"Standard Error (SE): {se_hp:.2f}\n")

plt.figure()
plt.hist(hp_sample, bins=10, edgecolor='black')
plt.title("Histogram of HP")
plt.xlabel("HP")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# === 19. 区間推定(母分散既知) ===
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
ci_known = (mean_hp - z * se_known, mean_hp + z * se_known)
print("[19] Confidence Interval (σ known)")
print(f"95% CI: {ci_known}\n")

# === 20. 区間推定(母分散未知) ===
t_score = t.ppf(0.975, df=n-1)
ci_unknown = (mean_hp - t_score * se_hp, mean_hp + t_score * se_hp)
print("[20] Confidence Interval (σ unknown)")
print(f"95% CI: {ci_unknown}\n")

# === 21. 母比率の区間推定 ===
success = np.sum(hp_sample >= 80)
p_hat = success / n
se_p = np.sqrt(p_hat * (1 - p_hat) / n)
ci_prop = (p_hat - z * se_p, p_hat + z * se_p)
print("[21] Confidence Interval for Proportion (P(HP >= 80))")
print(f"Proportion: {p_hat:.2f}, 95% CI: {ci_prop}\n")

plt.figure()
plt.bar(['HP >= 80', 'HP < 80'], [success, n - success], color=['skyblue', 'lightcoral'])
plt.title("HP ≥ 80 vs HP < 80")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# === 22. 母分散の区間推定 ===
chi2_lower = chi2.ppf(0.025, df=n-1)
chi2_upper = chi2.ppf(0.975, df=n-1)
ci_var = ((n - 1) * var_hp / chi2_upper, (n - 1) * var_hp / chi2_lower)
print("[22] Confidence Interval for Population Variance")
print(f"95% CI for Variance: {ci_var}\n")

# === 23. 検定の基礎と用語 / Basics and Terminology of Hypothesis Testing ===
# → 検定とは:仮説(H0, H1)に基づき、サンプルから統計的判断を行う方法。
#    A hypothesis test uses a sample to make a statistical decision about the null (H0) and alternative (H1) hypotheses.
# → 有意水準α、検出力(1-β)、第1種の過誤(α)、第2種の過誤(β)、両側/片側検定 などを含む。
#    Key terms: significance level α, power (1-β), Type I error (α), Type II error (β), and two-sided vs one-sided tests.
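# 参考(追記): 下のセクション24の1標本t検定(μ0 = 80, α = 0.05)を例に、両側棄却域と
# シミュレーションによる第1種の過誤率を確認する最小スケッチ。サンプルサイズ36と母標準偏差25は説明用の仮の値。
# An added illustrative sketch (not part of the original analysis): it shows the two-sided
# rejection region for the one-sample t-test of section 24 and checks by simulation that the
# Type I error rate stays near α when H0 is true. The sample size (36) and population SD (25)
# are arbitrary demo values.
alpha = 0.05
n_demo = 36
t_crit = t.ppf(1 - alpha / 2, df=n_demo - 1)  # 両側棄却域の臨界値 / critical value of the two-sided rejection region
print(f"[23] Reject H0 when |t| > {t_crit:.3f} (alpha = {alpha}, df = {n_demo - 1})")

rng = np.random.default_rng(0)
samples = rng.normal(loc=80, scale=25, size=(2000, n_demo))  # H0が真のデータを生成 / data generated with H0 true
t_stats_demo = (samples.mean(axis=1) - 80) / (samples.std(axis=1, ddof=1) / np.sqrt(n_demo))
type1_rate = np.mean(np.abs(t_stats_demo) > t_crit)
print(f"[23] Simulated Type I error rate: {type1_rate:.3f} (should be close to alpha = {alpha})\n")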

# === 24. 平均値の検定(t検定) ===
mu0 = 80
print("[24] One-sample t-test against μ0 = 80")
t_stat, p_val = ttest_1samp(hp_sample, mu0)
print(f"t-statistic = {t_stat:.3f}, p-value (two-tailed) = {p_val:.4f}\n")

plt.figure()
plt.hist(hp_sample, bins=10, color='lightgreen', edgecolor='black')
plt.axvline(mu0, color='red', linestyle='--', label='μ₀ = 80')
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.title("One-sample t-test Visualization")
plt.xlabel("HP")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()

# === 26. 相関分析 ===
print("[26] Correlation Analysis between HP and Attack")
correlation, p_corr = pearsonr(hp_sample, attack_sample)
print(f"Pearson r = {correlation:.3f}, p-value = {p_corr:.4f}\n")

plt.figure()
plt.scatter(hp_sample, attack_sample, alpha=0.7)
plt.title("Scatter Plot: HP vs Attack")
plt.xlabel("HP")
plt.ylabel("Attack")
plt.grid(True)
plt.tight_layout()
plt.show()

# === Visualization: Confidence Interval Summary ===
plt.figure(figsize=(10, 6))
plt.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
plt.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
plt.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
plt.title("Confidence Intervals for Population Mean HP")
plt.xlabel("HP")
plt.legend()
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・検定・相関分析・回帰分析・分散分析を詳しく学ぶ。
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)

# === 27. 単回帰分析 / Simple Linear Regression ===
print("[27] Simple Linear Regression: Predict Attack from HP")
X = np.array(hp_sample).reshape(-1, 1)
y = np.array(attack_sample)
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
residuals = y - y_pred

print(f"Regression Equation: Attack = {model.coef_[0]:.2f} * HP + {model.intercept_:.2f}")
print(f"R^2 (coefficient of determination): {r2:.3f}")
print(f"MSE (mean squared error): {mse:.2f}")
print(f"Mean of Residuals: {np.mean(residuals):.2f}\n")

plt.figure()
sns.regplot(x=hp_sample, y=attack_sample, ci=None, line_kws={'color': 'red'})
plt.title("Simple Linear Regression: HP → Attack")
plt.xlabel("HP")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()

plt.figure()
plt.scatter(hp_sample, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title("Residual Plot")
plt.xlabel("HP")
plt.ylabel("Residuals")
plt.tight_layout()
plt.show()

# === 28. 等分散性検定とWelchのt検定 ===
print("[28] Welch's t-test (HP >= 80 vs HP < 80)")
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
print(f"Welch's t-statistic: {t_stat:.3f}, p-value: {p_val:.4f}")
print(f"Group 1 Mean Attack (HP ≥ 80): {group1.mean():.2f}")
print(f"Group 2 Mean Attack (HP < 80): {group2.mean():.2f}\n")

plt.figure()
sns.boxplot(data=[group1, group2], notch=True)
plt.xticks([0, 1], ["HP ≥ 80", "HP < 80"])
plt.title("Attack Comparison by HP Group (Welch's t-test)")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()

# === 29. 一元配置分散分析 / One-Way ANOVA ===
print("[29] One-Way ANOVA on Attack grouped by HP Quantiles")
df['HP_Q'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
groups = [df[df['HP_Q'] == q]['Attack'] for q in ['Low', 'Mid', 'High']]
F_stat, p_val = f_oneway(*groups)
print(f"ANOVA F-statistic: {F_stat:.3f}, p-value: {p_val:.4f}")
print("Group Means:")
for q in ['Low', 'Mid', 'High']:
    mean_attack = df[df['HP_Q'] == q]['Attack'].mean()
    print(f"  {q} HP Group Mean Attack: {mean_attack:.2f}")
print()

plt.figure()
sns.boxplot(x='HP_Q', y='Attack', data=df, order=['Low', 'Mid', 'High'])
plt.title("One-Way ANOVA: Attack by HP Quantile")
plt.xlabel("HP Group")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: Learn point estimation, interval estimation, hypothesis testing, correlation analysis, regression analysis, and ANOVA in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ]
}
df = pd.DataFrame(pokemon)
hp_sample = df['HP']
attack_sample = df['Attack']
n = len(hp_sample)

# === 28. F-test for Equal Variances (with CI for the variance ratio) and Welch's t-test ===
print("[28] F-test for Equal Variances")
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
var1 = np.var(group1, ddof=1)
var2 = np.var(group2, ddof=1)
f_stat = var1 / var2 if var1 > var2 else var2 / var1

# Degrees of freedom and the lower/upper critical values of the F distribution
n1, n2 = len(group1), len(group2)
df1, df2 = n1 - 1, n2 - 1
f_crit_low = f.ppf(0.025, df1, df2)
f_crit_high = f.ppf(0.975, df1, df2)

# Confidence interval for the ratio of population variances
ci_f_lower = var1 / var2 / f_crit_high
ci_f_upper = var1 / var2 / f_crit_low

print(f"F-statistic (larger variance / smaller variance): {f_stat:.3f}")
print(f"95% Confidence Interval for Variance Ratio (σ1² / σ2²): ({ci_f_lower:.3f}, {ci_f_upper:.3f})")
print(f"Variance Group1 (HP ≥ 80): {var1:.2f}, n = {n1}")
print(f"Variance Group2 (HP < 80): {var2:.2f}, n = {n2}")
print("Interpretation: If CI includes 1, we cannot reject equal variances.\n")

# Welch's t-test (two-sample test of the mean difference without assuming equal variances)
print("[28] Welch's t-test (HP >= 80 vs HP < 80)")
t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
print(f"Welch's t-statistic: {t_stat:.3f}, p-value: {p_val:.4f}")
print(f"Group 1 Mean Attack (HP ≥ 80): {group1.mean():.2f}")
print(f"Group 2 Mean Attack (HP < 80): {group2.mean():.2f}")
print("Interpretation: If p < 0.05, the group means differ significantly.\n")

# Visualization (boxplot)
plt.figure()
sns.boxplot(data=[group1, group2], notch=True)
plt.xticks([0, 1], ["HP ≥ 80", "HP < 80"])
plt.title("Attack Comparison by HP Group (Welch's t-test)")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
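# --- Optional cross-check (not in the original): rebuild Welch's t and the Welch–Satterthwaite
# degrees of freedom by hand. Assumes group1, group2, var1, var2, n1, n2, t_stat from above.
se2 = var1 / n1 + var2 / n2
t_manual = (group1.mean() - group2.mean()) / np.sqrt(se2)
df_welch = se2**2 / ((var1 / n1)**2 / (n1 - 1) + (var2 / n2)**2 / (n2 - 1))
print(f"Manual Welch t: {t_manual:.3f}  (scipy: {t_stat:.3f})")
print(f"Welch–Satterthwaite degrees of freedom: {df_welch:.1f}")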

# === 29. 一元配置分散分析 / One-Way ANOVA ===
print("[29] One-Way ANOVA on Attack grouped by HP Quantiles")
df['HP_Q'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
groups = [df[df['HP_Q'] == q]['Attack'] for q in ['Low', 'Mid', 'High']]
F_stat, p_val = f_oneway(*groups)
print(f"ANOVA F-statistic: {F_stat:.3f}, p-value: {p_val:.4f}")
print("Group Means (Attack):")
for q in ['Low', 'Mid', 'High']:
    mean_attack = df[df['HP_Q'] == q]['Attack'].mean()
    print(f"  {q} HP Group: {mean_attack:.2f}")
print("Interpretation: If p < 0.05, at least one group mean differs.\n")

# Visualization (boxplot of Attack by HP tertile)
plt.figure()
sns.boxplot(x='HP_Q', y='Attack', data=df, order=['Low', 'Mid', 'High'])
plt.title("One-Way ANOVA: Attack by HP Quantile")
plt.xlabel("HP Group")
plt.ylabel("Attack")
plt.tight_layout()
plt.show()
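# --- Optional cross-check (not in the original): rebuild the ANOVA F-statistic from the
# between-group and within-group sums of squares. Assumes df, groups, F_stat from above.
grand_mean = df['Attack'].mean()
ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
ss_within = sum(((g - g.mean())**2).sum() for g in groups)
df_between = len(groups) - 1
df_within = len(df) - len(groups)
F_manual = (ss_between / df_between) / (ss_within / df_within)
print(f"Manual F: {F_manual:.3f}  (scipy: {F_stat:.3f})")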
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: Learn point estimation, interval estimation, hypothesis testing, correlation analysis, regression analysis, ANOVA, and odds ratio analysis in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f, fisher_exact

# --- Step 0: データ定義 / Pokémon Base Stats (拡張) ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ]
}
df = pd.DataFrame(pokemon)

# === 1. 2x2 Contingency Table and Odds Ratio ===
# Working definition: HP >= 80 counts as "High HP" and Attack >= 100 as "High Attack"
high_hp = df['HP'] >= 80
high_attack = df['Attack'] >= 100

# クロス集計表作成 / Contingency table
contingency = pd.crosstab(high_hp, high_attack)
print("[1] 2x2 Contingency Table (High HP vs High Attack):")
print(contingency, "\n")

# オッズ比の計算 / Odds Ratio
oddsratio, p_fisher = fisher_exact(contingency)
print(f"Odds Ratio: {oddsratio:.3f}")
print(f"Fisher's Exact Test p-value: {p_fisher:.4f}")
print("Interpretation: If OR > 1, high HP is positively associated with high Attack.\n")

# Visualization: heatmap of the contingency table
sns.heatmap(contingency, annot=True, fmt='d', cmap='Blues')
plt.title("2x2 Contingency Table: High HP vs High Attack")
plt.xlabel("High Attack")
plt.ylabel("High HP")
plt.tight_layout()
plt.show()
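# --- Optional addition (not in the original): recompute the odds ratio from the 2x2 cell counts
# and attach an approximate Wald 95% CI on the log-odds scale (normal approximation, 1.96).
# Assumes contingency from above; rows/columns are ordered (False, True) by the boolean crosstab.
a = contingency.loc[False, False]; b = contingency.loc[False, True]
c = contingency.loc[True, False];  d = contingency.loc[True, True]
or_manual = (a * d) / (b * c)
se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
ci_low, ci_high = np.exp(np.log(or_manual) + np.array([-1.96, 1.96]) * se_log_or)
print(f"Manual OR: {or_manual:.3f}, approx. 95% CI: ({ci_low:.3f}, {ci_high:.3f})")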
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: Learn point estimation, interval estimation, hypothesis testing, correlation analysis, regression analysis, ANOVA, odds ratio analysis, probability distributions, and nonparametric tests in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, pearsonr, f_oneway, f, fisher_exact, binom, poisson, geom, multinomial, chisquare, spearmanr, mannwhitneyu, wilcoxon, kruskal, gamma

# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ],
    'Defense': [
        40, 78, 65, 49, 90, 50, 60, 80, 80, 45, 160, 20,
        79, 95, 80, 53, 80, 83, 100, 55, 77, 73, 75, 110,
        95, 70, 75, 180, 70, 115, 70, 85, 110, 120, 95, 95,
        75, 85, 95, 70, 70, 55
    ],
    'Speed': [
        90, 100, 30, 45, 130, 55, 110, 55, 60, 120, 70, 20,
        81, 80, 105, 87, 95, 80, 78, 110, 85, 60, 100, 30,
        70, 100, 50, 70, 67, 75, 150, 55, 45, 40, 85, 110,
        91, 50, 70, 70, 105, 60
    ]
}
df = pd.DataFrame(pokemon)

# --- Plot: HP vs Attack ---
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='HP', y='Attack')
plt.title('HP vs Attack of Pokémon')
plt.xlabel('HP')
plt.ylabel('Attack')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Plot: HP Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['HP'], bins=10, kde=True)
plt.title('Distribution of HP')
plt.xlabel('HP')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# --- Plot: Attack Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Attack'], bins=10, kde=True)
plt.title('Distribution of Attack')
plt.xlabel('Attack')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# --- Plot: Defense vs Speed ---
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='Defense', y='Speed')
plt.title('Defense vs Speed of Pokémon')
plt.xlabel('Defense')
plt.ylabel('Speed')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Plot: Defense Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Defense'], bins=10, kde=True)
plt.title('Distribution of Defense')
plt.xlabel('Defense')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# --- Plot: Speed Distribution ---
plt.figure(figsize=(8, 4))
sns.histplot(df['Speed'], bins=10, kde=True)
plt.title('Distribution of Speed')
plt.xlabel('Speed')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
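# --- Optional addition (not in the original): summarize the pairwise relationships shown in the
# scatter plots above with a single correlation heatmap of the four base stats.
plt.figure(figsize=(6, 5))
sns.heatmap(df[['HP', 'Attack', 'Defense', 'Speed']].corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Base Stats')
plt.tight_layout()
plt.show()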
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: Learn point estimation, interval estimation, hypothesis testing, correlation analysis, regression analysis, ANOVA, odds ratio analysis, probability distributions, nonparametric tests, and multivariate analysis (logistic regression and clustering) in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage

# --- Step 0: データ定義 / Pokémon Base Stats ---
pokemon = {
    'Name': [
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
        'Pidgeot', 'Vileplume', 'Poliwrath', 'Dewgong', 'Rapidash', 'Farfetch’d'
    ],
    'HP': [
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
        83, 75, 90, 90, 65, 52
    ],
    'Attack': [
        55, 84, 110, 49, 110, 55, 65, 130, 85, 50, 45, 45,
        125, 134, 110, 120, 110, 82, 83, 90, 92, 70, 76, 75,
        60, 110, 105, 95, 73, 130, 50, 95, 80, 130, 65, 100,
        80, 80, 85, 70, 100, 65
    ],
    'Defense': [
        40, 78, 65, 49, 90, 50, 60, 80, 80, 45, 160, 20,
        79, 95, 80, 53, 80, 83, 100, 55, 77, 73, 75, 110,
        95, 70, 75, 180, 70, 115, 70, 85, 110, 120, 95, 95,
        75, 85, 95, 70, 70, 55
    ],
    'Speed': [
        90, 100, 30, 45, 130, 55, 110, 55, 60, 120, 70, 20,
        81, 80, 105, 87, 95, 80, 78, 110, 85, 60, 100, 30,
        70, 100, 50, 70, 67, 75, 150, 55, 45, 40, 85, 110,
        91, 50, 70, 70, 105, 60
    ]
}
df = pd.DataFrame(pokemon)

# === 7-1. Logistic Regression (binary classification) ===
# Condition: Attack >= 100 is labeled 1 (high offense)
X_logit = df[['HP', 'Defense', 'Speed']]
y_logit = (df['Attack'] >= 100).astype(int)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_logit)
log_reg = LogisticRegression()
log_reg.fit(X_scaled, y_logit)
print("\n[7-1] Logistic Regression Coefficients:")
for col, coef in zip(X_logit.columns, log_reg.coef_[0]):
    print(f"  {col}: {coef:.3f}")

# === 7-4. Hierarchical Clustering (dendrogram) ===
features = ['HP', 'Attack', 'Defense', 'Speed']
X_cluster = df[features]
X_scaled = StandardScaler().fit_transform(X_cluster)
linkage_matrix = linkage(X_scaled, method='ward')
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix, labels=df['Name'].values, leaf_rotation=90)
plt.title("[7-4] Hierarchical Clustering Dendrogram")
plt.xlabel("Pokémon")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()
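# --- Optional extension (not in the original): cut the dendrogram above into 3 flat clusters.
# Assumes linkage_matrix from the code above is in scope.
from scipy.cluster.hierarchy import fcluster
hier_labels = fcluster(linkage_matrix, t=3, criterion='maxclust')
print("[7-4] Hierarchical cluster sizes:", pd.Series(hier_labels).value_counts().to_dict())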

# === 7-6. Non-hierarchical Clustering (KMeans) ===
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
df['KMeansCluster'] = kmeans_labels

# PCA for 2D visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
df['PCA1'] = pca_result[:, 0]
df['PCA2'] = pca_result[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='KMeansCluster', palette='Set2')
plt.title('[7-6] KMeans Clustering with PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()
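# --- Optional addition (not in the original): report how much variance the two PCA axes retain
# and how the Pokémon are distributed across the KMeans clusters.
print("[7-6] PCA explained variance ratio:", np.round(pca.explained_variance_ratio_, 3))
print("[7-6] KMeans cluster sizes:", df['KMeansCluster'].value_counts().to_dict())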
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: Learn point estimation, interval estimation, hypothesis testing, correlation analysis, regression analysis, ANOVA, odds ratio analysis, probability distributions, nonparametric tests, and multivariate analysis (logistic regression and clustering) in detail using Pokémon base stats.
# Usage: Run with `python pokemon_population_estimation.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import ttest_ind, chi2_contingency, linregress, f_oneway

# --- Step 0: Data definition / Pokémon Base Stats ---
# Omitted here; reuse the `pokemon` dict and `df = pd.DataFrame(pokemon)` from the previous program.

# === 1-1. Boxplot ===
plt.figure()
sns.boxplot(data=df[['HP', 'Attack', 'Defense', 'Speed']])
plt.title("[1-1] Boxplot of Base Stats")
plt.tight_layout()
plt.show()

# === 1-2. Correlation Coefficients and Scatter Matrix ===
sns.pairplot(df[['HP', 'Attack', 'Defense', 'Speed']])
plt.suptitle("[1-2] Scatter Matrix of Stats", y=1.02)
plt.show()

# === 1-3. Histograms and Descriptive Statistics ===
print("\n[1-3] Descriptive Statistics")
print(df[['HP', 'Attack', 'Defense', 'Speed']].describe())
df[['HP', 'Attack', 'Defense', 'Speed']].hist(bins=10, figsize=(10, 6))
plt.suptitle("[1-3] Histograms of Base Stats")
plt.tight_layout()
plt.show()

# === 1-4. Stacked Bar Chart ===
stat_stack = df[['HP', 'Attack', 'Defense', 'Speed']].head(10)
stat_stack.index = df['Name'][:10]
stat_stack.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title("[1-4] Stacked Bar Graph of Stats (Top 10 Pokémon)")
plt.ylabel("Stat Total")
plt.tight_layout()
plt.show()

# === 1-5. Line Plot of Base Stats ===
df_plot = df[['Name', 'HP', 'Attack', 'Defense', 'Speed']].set_index('Name').head(10).T
plt.figure(figsize=(10, 5))
for name in df_plot.columns:
    plt.plot(df_plot.index, df_plot[name], marker='o', label=name)
plt.title("[1-5] Line Plot of Stats (Top 10 Pokémon)")
plt.xlabel("Stat Type")
plt.ylabel("Value")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# === 2-1. Independent Two-Sample t-test ===
group1 = df[df['HP'] >= 80]['Attack']
group2 = df[df['HP'] < 80]['Attack']
t_stat, p_val = ttest_ind(group1, group2)
print(f"\n[2-1] Two-sample t-test (Attack by HP Group): t = {t_stat:.3f}, p = {p_val:.4f}")

# === 2-2. Chi-square Test of Independence ===
df['HighSpeed'] = (df['Speed'] > 90)
df['HighAttack'] = (df['Attack'] > 100)
crosstab = pd.crosstab(df['HighSpeed'], df['HighAttack'])
chi2, p, dof, expected = chi2_contingency(crosstab)
print(f"\n[2-2] Chi-square Test of Independence: chi2 = {chi2:.3f}, p = {p:.4f}")

# === 2-3. Simple Linear Regression ===
slope, intercept, r_value, p_value, std_err = linregress(df['HP'], df['Attack'])
print(f"\n[2-3] Simple Linear Regression: Attack ~ HP")
print(f"  Slope = {slope:.3f}, Intercept = {intercept:.3f}, R² = {r_value**2:.3f}, p = {p_value:.4f}")

# === 2-5. One-Way ANOVA ===
df['HP_Group'] = pd.qcut(df['HP'], 3, labels=['Low', 'Mid', 'High'])
g1 = df[df['HP_Group'] == 'Low']['Attack']
g2 = df[df['HP_Group'] == 'Mid']['Attack']
g3 = df[df['HP_Group'] == 'High']['Attack']
F_stat, p_val = f_oneway(g1, g2, g3)
print(f"\n[2-5] One-Way ANOVA: F = {F_stat:.3f}, p = {p_val:.4f}")
# Program Name: pokemon_stat_analysis_integrated.py
# Creation Date: 20250420
# Overview: An integrated analysis program that uses Pokémon base stats to cover descriptive statistics, estimation and hypothesis testing, multivariate analysis, and clustering.
# Usage: Run with `python pokemon_stat_analysis_integrated.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, norm, linregress, f_oneway, percentileofscore, zscore

# --- Pokémon base stat definitions ---
pokemon = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Mewtwo', 'Bulbasaur'],
    'HP': [35, 78, 160, 106, 45],
    'Attack': [55, 84, 110, 110, 49],
    'Defense': [40, 78, 65, 90, 49],
    'Speed': [90, 100, 30, 130, 45]
}
df = pd.DataFrame(pokemon)

# --- 2-2: Descriptive statistics and histograms ---
print("基本統計量 / Descriptive Statistics")
print(df.describe())

stats = ['HP', 'Attack', 'Defense', 'Speed']
df[stats].hist(bins=10, figsize=(10, 6))
plt.suptitle("Histogram of Pokémon Base Stats")
plt.tight_layout()
plt.show()

# --- Scatter plots and correlation ---
print("\n相関係数行列 / Correlation Matrix")
print(df[stats].corr())
sns.pairplot(df[stats])
plt.suptitle("Scatter Matrix of Pokémon Base Stats", y=1.02)
plt.show()

# --- Boxplot ---
plt.figure()
sns.boxplot(data=df[stats])
plt.title("Boxplot of Pokémon Base Stats")
plt.tight_layout()
plt.show()

# --- 2-1: t-test (example: Attack difference for HP >= 80 vs HP < 80) ---
df['HP_Group'] = ['High' if hp >= 80 else 'Low' for hp in df['HP']]
group1 = df[df['HP_Group'] == 'High']['Attack']
group2 = df[df['HP_Group'] == 'Low']['Attack']
t_stat, p_val = ttest_ind(group1, group2)
print(f"\n[2-1] t検定(HP高低でのAttack差): t = {t_stat:.3f}, p = {p_val:.4f}")

# --- 2-1: z-test (test of the HP mean assuming a known population variance) ---
mu_hp = 100
sigma_hp = 30
xbar_hp = df['HP'].mean()
n_hp = len(df['HP'])
z = (xbar_hp - mu_hp) / (sigma_hp / np.sqrt(n_hp))
p_z = 2 * (1 - norm.cdf(abs(z)))
print(f"[2-1] z検定(HPの母平均100との比較): z = {z:.3f}, p = {p_z:.4f}")

# --- 2-3: Regression analysis (Attack ~ HP) ---
slope, intercept, r_value, p_value, std_err = linregress(df['HP'], df['Attack'])
print(f"\n[2-3] 回帰分析 Attack ~ HP: slope = {slope:.3f}, intercept = {intercept:.3f}, R² = {r_value**2:.3f}, p = {p_value:.4f}")

# --- 2-4: One-way ANOVA (Attack difference across HP tertiles) ---
df['HP_Tile'] = pd.qcut(df['HP'], q=3, labels=['Low', 'Mid', 'High'])
low = df[df['HP_Tile'] == 'Low']['Attack']
mid = df[df['HP_Tile'] == 'Mid']['Attack']
high = df[df['HP_Tile'] == 'High']['Attack']
F, p_anova = f_oneway(low, mid, high)
print(f"\n[2-4] 一元配置分散分析: F = {F:.3f}, p = {p_anova:.4f}")

# --- 2-5: Random sampling, ranks, percentiles, standardization ---
sample_df = df.sample(n=3, random_state=1)
print("\n[2-5] ランダムサンプリング:")
print(sample_df[['Name', 'HP', 'Attack']])

print("\n[2-5] HPの百分位数:")
df['HP_percentile'] = df['HP'].apply(lambda x: percentileofscore(df['HP'], x))
print(df[['Name', 'HP', 'HP_percentile']])

print("\n[2-5] Zスコア標準化:")
for col in stats:
    df[f'{col}_z'] = zscore(df[col])
print(df[['Name'] + [f'{col}_z' for col in stats]])
# Program Name: pokemon_stat_analysis_integrated.py
# Creation Date: 20250420
# Overview: An integrated analysis program that uses Pokémon base stats to cover descriptive statistics, estimation and hypothesis testing, multivariate analysis, clustering, and the mathematical foundations (sigma notation, differentiation, integration, exponents and logarithms, unbiased variance, degrees of freedom).
# Usage: Run with `python pokemon_stat_analysis_integrated.py`

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, norm, linregress, f_oneway, percentileofscore, zscore

# --- Pokémon base stat definitions (placed first so the mathematical supplement below can use df) ---
pokemon = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Mewtwo', 'Bulbasaur'],
    'HP': [35, 78, 160, 106, 45],
    'Attack': [55, 84, 110, 110, 49],
    'Defense': [40, 78, 65, 90, 49],
    'Speed': [90, 100, 30, 130, 45]
}
df = pd.DataFrame(pokemon)

# --- Mathematical Supplement ---
# Sigma (Σ): summation
print("\n[Math Supplement] Sum of Attack via Σ: ΣAttack =", np.sum(df['Attack']))

# Basics of differentiation (example: derivative of x²)
from sympy import symbols, diff, integrate, log
x = symbols('x')
expr = x**2
derivative = diff(expr, x)
print("[Math Supplement] Derivative d/dx(x²) =", derivative)

# Basics of integration (example: integral of x)
integral = integrate(x, x)
print("[Math Supplement] Integral ∫x dx =", integral)

# Exponents and logarithms
print("[Math Supplement] Exponential e^2 =", np.exp(2))
print("[Math Supplement] Natural log log(10) =", np.log(10))
print("[Math Supplement] Common log log10(1000) =", np.log10(1000))

# Sample (biased) variance vs unbiased variance
var_n = np.var(df['HP'], ddof=0)   # population (biased) variance
var_n1 = np.var(df['HP'], ddof=1)  # unbiased variance
print("[Math Supplement] Sample variance (ddof=0) =", round(var_n, 3))
print("[Math Supplement] Unbiased variance (ddof=1) =", round(var_n1, 3))

# Degrees of freedom (the number of values free to vary)
n = len(df['HP'])
print("[Math Supplement] Degrees of freedom (n-1) =", n - 1)

# (The remaining statistical analysis and visualization code is the same as in the previous program and is omitted here.)
