ポケモンを題材にした統計学の学習カリキュラム案を作成しました。こちらは初期案(たたき台)としてご確認ください。
参考:以下に各章のサンプルプログラムを示します。
# Program Name: pokemon_data_summary_full.py
# Overview: Chapter 1 descriptive statistics computed on Generation 1 Pokémon data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew, kurtosis

# --- Load Gen 1 Pokémon data ---
# Only the nine species listed here are entered, not the full No.001-151 range.
data = {
    'No': [1, 2, 3, 6, 25, 26, 143, 149, 150],
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Sp. Atk': [65, 80, 100, 109, 50, 90, 65, 100, 154],
    'Sp. Def': [65, 80, 100, 85, 50, 80, 110, 100, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
    'Total': [318, 405, 525, 534, 320, 485, 540, 600, 680]
}
df = pd.DataFrame(data)

# The six base-stat columns every summary below is computed over;
# 'Total' only joins in for the correlation matrix and the regression target.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
base_stats = df[stat_columns]

# --- 1-4-1 Mode and Median ---
# mode() can return several rows per column; only the first row is kept.
mode_vals = base_stats.mode().iloc[0]
median_vals = base_stats.median()

# --- 1-4-2 Mean ---
mean_vals = base_stats.mean()

# --- 1-5-1 Range and Quartiles ---
range_vals = base_stats.max() - base_stats.min()
quartiles = base_stats.quantile([0.25, 0.5, 0.75])

# --- 1-5-2 Mean Absolute Deviation and Standard Deviation ---
abs_deviation = (base_stats - mean_vals).abs()
mad_vals = abs_deviation.mean()
std_vals = base_stats.std()

# --- 1-6 Coefficient of Variation, Skewness, Kurtosis ---
cv_vals = std_vals / mean_vals
skew_vals = base_stats.apply(skew)
kurt_vals = base_stats.apply(kurtosis)

# --- 1-7-2 Correlation Matrix and Pairplot ---
correlation_matrix = df[stat_columns + ['Total']].corr()
sns.pairplot(df[['Attack', 'Defense', 'Sp. Atk', 'Speed']])
plt.suptitle('Gen 1 Pokémon - Stats Scatter Matrix', y=1.02)
plt.show()

# --- 1-8 Simple Linear Regression (Attack -> Total) ---
X = df[['Attack']]
y = df['Total']
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

plt.figure(figsize=(8, 5))
plt.scatter(df['Attack'], df['Total'], color='blue', label='Data')
plt.plot(df['Attack'], y_pred, color='red', label='Regression Line')
plt.title('Attack vs Total - Regression Line')
plt.xlabel('Attack')
plt.ylabel('Total')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Summary output ---
# Every pandas result is converted to a plain dict; the regression
# parameters are stored as scalars.
summary = {
    "Mean": mean_vals.to_dict(),
    "Median": median_vals.to_dict(),
    "Mode": mode_vals.to_dict(),
    "Range": range_vals.to_dict(),
    "Quartiles": quartiles.to_dict(),
    "Standard Deviation": std_vals.to_dict(),
    "Mean Absolute Deviation": mad_vals.to_dict(),
    "Coefficient of Variation": cv_vals.to_dict(),
    "Skewness": skew_vals.to_dict(),
    "Kurtosis": kurt_vals.to_dict(),
    "Correlation Matrix": correlation_matrix.to_dict(),
    "Regression Coefficient": model.coef_[0],
    "Intercept": model.intercept_
}
summary
# Program Name: pokemon_probability_chapter2.py
# Overview: Chapter 2 probability basics on Pokémon data, plus a bar chart,
#           a Monte Carlo simulation and a Beta posterior plot.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from scipy.stats import beta

# --- Define simplified Pokémon dataset ---
data = {
    'Name': ['ピカチュウ', 'リザードン', 'カビゴン', 'フシギダネ', 'ミュウツー'],
    'Attack': [55, 84, 110, 49, 110],
    'Speed': [90, 100, 30, 45, 130],
    'Legendary': [False, False, False, False, True]
}
df = pd.DataFrame(data)

# Row masks for the two events used throughout:
#   A = "Attack >= 100", B = "Speed > 90".
is_high_attack = df['Attack'] >= 100
is_high_speed = df['Speed'] > 90

# --- 2-1: Sample Space and Events ---
sample_space = set(df['Name'])
# NOTE: despite its name, this is the high-*Attack* event (Attack >= 100).
high_hp_event = set(df.loc[is_high_attack, 'Name'])

# --- 2-2: Classical Probability ---
P_high_attack = len(high_hp_event) / len(sample_space)

# --- 2-3: Conditional Probability ---
high_speed = df[is_high_speed]
joint = df[is_high_speed & is_high_attack]
P_B = len(high_speed) / len(df)
P_A_and_B = len(joint) / len(df)
if P_B != 0:
    P_A_given_B = P_A_and_B / P_B
else:
    P_A_given_B = 0

# --- 2-4: Independence ---
P_A = P_high_attack
independent = np.isclose(P_A_and_B, P_A * P_B)

# --- 2-5: Bayes Theorem ---
legendary = df[df['Legendary']]
P_Legendary = len(legendary) / len(df)
P_HighAttack_given_Legendary = len(legendary[legendary['Attack'] >= 100]) / len(legendary)
P_HighAttack = P_A
if P_HighAttack != 0:
    P_Legendary_given_HighAttack = (P_HighAttack_given_Legendary * P_Legendary) / P_HighAttack
else:
    P_Legendary_given_HighAttack = 0

# --- Bar Chart ---
labels = ['P(Attack>=100)', 'P(Speed>90)', 'P(Attack>=100 | Speed>90)', 'P(Legendary | Attack>=100)']
values = [P_high_attack, P_B, P_A_given_B, P_Legendary_given_HighAttack]
plt.figure(figsize=(8, 5))
plt.bar(labels, values, color='skyblue')
plt.ylim(0, 1)
plt.title('Probability Comparison')
plt.ylabel('Probability')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Monte Carlo Simulation ---
# Attack is drawn before Speed so the seeded random stream is consumed in a
# fixed order.
np.random.seed(42)
simulated_attack = np.random.randint(30, 150, 1000)
simulated_speed = np.random.randint(30, 150, 1000)
sim_data = pd.DataFrame({'Attack': simulated_attack, 'Speed': simulated_speed})
sim_data['HighAttack'] = sim_data['Attack'] >= 100
sim_data['HighSpeed'] = sim_data['Speed'] > 90
P_mc = (sim_data['HighAttack'] & sim_data['HighSpeed']).mean()

# --- Bayesian Estimation using Beta ---
# Beta(successes + 1, failures + 1), where a "trial" is a Speed > 90 row and
# a "success" is such a row that also has Attack >= 100.
successes = len(joint)
trials = len(high_speed)
a = successes + 1
b = trials - successes + 1
x = np.linspace(0, 1, 500)
y = beta.pdf(x, a, b)
plt.figure(figsize=(8, 5))
plt.plot(x, y, label=f'Beta({a},{b})')
plt.title('Posterior Distribution (Bayes Estimate)')
plt.xlabel('P(Attack >= 100 | Speed > 90)')
plt.ylabel('Density')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# --- Output ---
probability_summary = {
    'P(Attack >= 100)': round(P_high_attack, 3),
    'P(Speed > 90)': round(P_B, 3),
    'P(Attack >= 100 | Speed > 90)': round(P_A_given_B, 3),
    'Independence (A ⊥ B)': independent,
    'P(Legendary | Attack >= 100)': round(P_Legendary_given_HighAttack, 3),
    'Monte Carlo P(A ∩ B)': round(P_mc, 3)
}
probability_summary
# Program Name: pokemon_probability_chapter3.py
# Overview: Chapter 3 "Random Variables" - statistics and distribution plots
#           built on a small Pokémon data set.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, binom, poisson

# --- Define Pokémon dataset ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo'],
    'Attack': [55, 84, 110, 49, 110],
    'Speed': [90, 100, 30, 45, 130],
}
df = pd.DataFrame(data)

# --- 3-1: Discrete & Continuous Variables ---
# Relative frequency of each distinct Attack value, i.e. the empirical PMF.
attack_counts = df['Attack'].value_counts(normalize=True).sort_index()
plt.figure(figsize=(8, 4))
attack_counts.plot(kind='bar', color='skyblue')
plt.title('PMF: Relative Frequency of Attack')
plt.xlabel('Attack Value')
plt.ylabel('Probability')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- 3-2: PDF/CDF with Normal Distribution ---
# Normal curve parameterised by the sample mean and sample standard deviation.
x = np.linspace(df['Attack'].min(), df['Attack'].max(), 200)
mu, sigma = df['Attack'].mean(), df['Attack'].std()
pdf = norm.pdf(x, mu, sigma)
cdf = norm.cdf(x, mu, sigma)

plt.figure(figsize=(8, 4))
plt.hist(df['Attack'], bins=5, density=True, alpha=0.5, label='Data Hist')
plt.plot(x, pdf, 'r-', label='Normal PDF')
plt.title('PDF Comparison: Attack')
plt.xlabel('Attack')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 4))
plt.plot(x, cdf, 'g-', label='Normal CDF')
plt.title('CDF: Normal Distribution of Attack')
plt.xlabel('Attack')
plt.ylabel('Cumulative Probability')
plt.grid(True)
plt.tight_layout()
plt.legend()
plt.show()

# --- 3-3: Expected Value & Variance ---
# E[X] and E[X^2] over the empirical PMF; Var[X] = E[X^2] - E[X]^2.
attack_probs = df['Attack'].value_counts(normalize=True).sort_index()
expected = sum(value * prob for value, prob in zip(attack_probs.index, attack_probs))
expected_sq = sum((value**2) * prob for value, prob in zip(attack_probs.index, attack_probs))
variance = expected_sq - expected**2

# --- 3-4: Binomial & Poisson Distributions ---
n = 10
p = (df['Attack'] >= 100).mean()  # observed share of Attack >= 100
k = np.arange(0, 11)
binom_pmf = binom.pmf(k, n, p)
poisson_pmf = poisson.pmf(k, mu=n * p)  # Poisson with matching mean n*p

plt.figure(figsize=(8, 4))
plt.stem(k, binom_pmf, linefmt='b-', markerfmt='bo', basefmt='r-', label='Binomial')
plt.stem(k, poisson_pmf, linefmt='g-', markerfmt='go', basefmt='r-', label='Poisson')
plt.title('Binomial vs Poisson (Attack ≥ 100 Occurrence)')
plt.xlabel('Number of High-Attack Pokémon in 10 Trials')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --- 3-5: Central Limit Theorem (CLT) ---
# 1000 resamples (with replacement) of size 5; each contributes its mean.
np.random.seed(0)
sample_means = [np.mean(np.random.choice(df['Attack'], size=5, replace=True)) for _ in range(1000)]
plt.figure(figsize=(8, 4))
sns.histplot(sample_means, bins=20, kde=True, color='purple')
plt.title('Sampling Distribution of the Mean (CLT)')
plt.xlabel('Sample Mean (Attack)')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Summary ---
summary_stats = {
    'Expected Attack': round(expected, 2),
    'Variance Attack': round(variance, 2),
    # These are pmf values, i.e. P(X = k) for k = 0..10; the previous label
    # 'Binomial P(X>=k)' described a tail probability that was never computed.
    'Binomial P(X=k)': [round(prob, 3) for prob in binom_pmf.tolist()],
    'Poisson Approx.': [round(prob, 3) for prob in poisson_pmf.tolist()]
}
summary_stats
# Program Name: pokemon_probability_chapter4.py
# Overview: Chapter 4 "Probability Distributions" - explains and plots a
#           range of distributions using Pokémon data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import bernoulli, binom, geom, hypergeom, poisson
from scipy.stats import uniform, norm, expon, gamma, chi2, beta, t, f


def _show_stem(k_values, probabilities, title, xlabel):
    """Show a PMF as a stem plot on its own figure (y-axis: 'Probability')."""
    plt.figure()
    plt.stem(k_values, probabilities, basefmt=' ')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Probability')
    plt.grid(True)
    plt.show()


def _show_curve(x_values, y_values, title, xlabel, ylabel):
    """Show a continuous curve (PDF or CDF) as a line plot on its own figure."""
    plt.figure()
    plt.plot(x_values, y_values)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


# --- Define Pokémon data (simplified) ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo'],
    'Attack': [55, 84, 110, 49, 110],
    'Speed': [90, 100, 30, 45, 130],
}
df = pd.DataFrame(data)

# --- 4-1: Bernoulli Distribution ---
# Whether a Pokémon has Attack >= 100 (1) or not (0).
p_bernoulli = (df['Attack'] >= 100).mean()
x = [0, 1]
y = bernoulli.pmf(x, p_bernoulli)
plt.figure()
plt.bar(x, y)
plt.title('Bernoulli Distribution: P(Attack ≥ 100)')
plt.xticks([0, 1], ['Attack < 100', 'Attack ≥ 100'])
plt.ylabel('Probability')
plt.grid(True)
plt.show()

# --- 4-2: Binomial Distribution ---
# How many of 10 Pokémon have Attack >= 100.
k = np.arange(0, 11)
binom_probs = binom.pmf(k, n=10, p=p_bernoulli)
_show_stem(k, binom_probs,
           'Binomial Distribution (n=10, p=P(Attack ≥ 100))',
           'Number of High Attack Pokémon')

# --- 4-3: Geometric Distribution ---
# Number of trials until the first Pokémon with Attack >= 100 appears.
k = np.arange(1, 11)
geom_probs = geom.pmf(k, p_bernoulli)
_show_stem(k, geom_probs,
           'Geometric Distribution: Trials until first Attack ≥ 100',
           'Trial Number')

# --- 4-4: Hypergeometric Distribution ---
# A population of 20 Pokémon contains 5 legendaries; 5 are drawn and we look
# at the probability that k of them are legendary.
M = 20  # population size
n = 5   # number of legendary Pokémon in the population
N = 5   # number of Pokémon drawn
k = np.arange(0, 6)
hyper_probs = hypergeom.pmf(k, M, n, N)
_show_stem(k, hyper_probs,
           'Hypergeometric Distribution: Legendary Pokémon Draws',
           'Number of Legendary Pokémon')

# --- 4-5: Poisson Distribution ---
# Model in which Attack >= 100 Pokémon appear 2.5 times per turn on average.
k = np.arange(0, 11)
poisson_probs = poisson.pmf(k, mu=2.5)
_show_stem(k, poisson_probs,
           'Poisson Distribution (λ=2.5 for Attack ≥ 100)',
           'Occurrences')

# --- 4-6: Uniform Distribution ---
# Assumes Attack is spread uniformly over 0 to 150.
x = np.linspace(0, 150, 500)
uniform_pdf = uniform.pdf(x, loc=0, scale=150)
_show_curve(x, uniform_pdf,
            'Uniform Distribution (Attack Range 0 to 150)',
            'Attack', 'Density')

# --- 4-7: Normal Distribution ---
# Assumes Attack is normal with the sample mean and standard deviation.
mu = df['Attack'].mean()
sigma = df['Attack'].std()
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 500)
pdf = norm.pdf(x, mu, sigma)
cdf = norm.cdf(x, mu, sigma)
_show_curve(x, pdf,
            'Normal Distribution PDF (Pokémon Attack)',
            'Attack', 'Probability Density')
_show_curve(x, cdf,
            'Normal Distribution CDF (Pokémon Attack)',
            'Attack', 'Cumulative Probability')

# --- 4-8: Exponential Distribution ---
# Waiting-time model for the next high-Attack Pokémon; the rate is taken as
# the reciprocal of the mean Attack.
lam = 1 / df['Attack'].mean()
x = np.linspace(0, 300, 500)
_show_curve(x, expon.pdf(x, scale=1/lam),
            'Exponential Distribution (based on Mean Attack)',
            'Turns Until Next High-Attack Pokémon', 'Probability Density')

# --- 4-9: Chi-squared Distribution ---
# Used to judge how well observed battle data agrees with a hypothesis.
x = np.linspace(0, 20, 500)
_show_curve(x, chi2.pdf(x, df=4),
            'Chi-squared Distribution (df=4)',
            'Chi-squared Statistic', 'Probability Density')

# --- 4-10: Beta Distribution ---
# Models quantities living in [0, 1] such as capture or hit rates.
x = np.linspace(0, 1, 500)
_show_curve(x, beta.pdf(x, a=2, b=5),
            'Beta Distribution (α=2, β=5)',
            'Probability of Capture', 'Density')

# --- 4-11: t-distribution ---
# Used for hypothesis tests on the mean Attack of small samples (e.g. 5).
x = np.linspace(-5, 5, 500)
_show_curve(x, t.pdf(x, df=4),
            't Distribution (df=4)',
            't-value', 'Probability Density')

# --- 4-12: F-distribution ---
# Used to compare the spread (variance) of Attack between Pokémon groups.
x = np.linspace(0, 5, 500)
_show_curve(x, f.pdf(x, dfn=5, dfd=2),
            'F Distribution (dfn=5, dfd=2)',
            'F-statistic', 'Probability Density')
# Program Name: pokemon_sampling_theory_chapter5.py
# NOTE: this section has no imports or data definition of its own; it uses
# `np`, `plt`, `sns`, `norm`, `binom` and `df`, which must already be in scope.

# --- 5-1: Sampling Distribution of Mean & Variance ---
# Assume Attack is normally distributed and visualise how the sample mean and
# the sample variance are distributed across repeated samples.
sample_size = 5
num_samples = 1000
mu, sigma = df['Attack'].mean(), df['Attack'].std()
sample_means = []
sample_vars = []
for _ in range(num_samples):
    sample = np.random.normal(loc=mu, scale=sigma, size=sample_size)
    sample_means.append(np.mean(sample))
    sample_vars.append(np.var(sample, ddof=1))  # ddof=1: unbiased variance

plt.figure()
sns.histplot(sample_means, bins=30, kde=True, color='skyblue')
plt.title('Sampling Distribution of the Mean (Attack)')
plt.xlabel('Sample Mean')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

plt.figure()
sns.histplot(sample_vars, bins=30, kde=True, color='lightgreen')
plt.title('Sampling Distribution of the Variance (Attack)')
plt.xlabel('Sample Variance')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# --- 5-2: Law of Large Numbers ---
# Show the running sample mean converging towards the population mean.
samples = np.random.normal(loc=mu, scale=sigma, size=1000)
cumulative_means = np.cumsum(samples) / np.arange(1, 1001)
plt.figure()
plt.plot(cumulative_means, label='Cumulative Mean')
plt.axhline(mu, color='red', linestyle='--', label='True Mean')
plt.title('Law of Large Numbers (Attack)')
plt.xlabel('Number of Samples')
plt.ylabel('Cumulative Mean')
plt.legend()
plt.grid(True)
plt.show()

# --- 5-3: Central Limit Theorem ---
# Draw Attack from a uniform distribution and check that the distribution of
# the sample mean (30 draws per sample) approaches a normal shape.
uniform_samples = np.random.uniform(low=0, high=150, size=(1000, 30))
uniform_sample_means = uniform_samples.mean(axis=1)
plt.figure()
sns.histplot(uniform_sample_means, bins=30, kde=True, color='violet')
plt.title('CLT: Mean of Uniformly Distributed Attack Stats')
plt.xlabel('Sample Mean')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# --- 5-3-1: Normal Approximation to Binomial ---
# Number of Attack >= 100 Pokémon among 100 (binomial) and its normal
# approximation N(n*p, n*p*(1-p)).
p_bernoulli = (df['Attack'] >= 100).mean()
n = 100
p = p_bernoulli
# Cover the full support 0..n. The previous range 0..40 ended at n*p = 40
# for this data, so only the left half of the distribution was drawn.
k = np.arange(0, n + 1)
binom_probs = binom.pmf(k, n=n, p=p)
normal_approx = norm.pdf(k, loc=n*p, scale=np.sqrt(n*p*(1-p)))
plt.figure()
plt.bar(k, binom_probs, alpha=0.6, label='Binomial')
plt.plot(k, normal_approx, 'r-', label='Normal Approx.')
plt.title('Binomial vs Normal Approximation (Attack ≥ 100)')
plt.xlabel('Number of High Attack Pokémon in 100')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)
plt.show()
# Program Name: pokemon_sampling_methods_chapter6.py
import pandas as pd
import numpy as np

# --- Define sample Pokémon data ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
    'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
    'Speed': [90, 100, 30, 45, 130, 55, 110, 55]
}
df = pd.DataFrame(data)

# --- 6-1: Census vs. Sampling ---
# A census looks at every Pokémon, a sample survey at only some of them.
# In practice only limited data is available, so sampling is the usual case.

# --- 6-2: Population and Sample ---
# Compare the population mean of Attack with the mean of a 3-row sample.
attack = df['Attack']
population = attack
sample = attack.sample(n=3, random_state=42)
print("Population Mean:", population.mean())
print("Sample Mean:", sample.mean())

# --- 6-3: Sampling Methods ---
# Simple random sampling.
random_sample = df.sample(n=3, random_state=1)
print("Random Sample:\n", random_sample)

# Systematic sampling, imitated by taking every second row from row 0.
every_other_row = np.arange(0, len(df), 2)
systematic_sample = df.iloc[every_other_row]
print("Systematic Sample:\n", systematic_sample)

# Stratified sampling needs category-based groups, so it is not implemented.

# --- 6-4: Random Numbers ---
# Ten Attack values drawn with replacement from a seeded generator.
np.random.seed(0)
random_attacks = np.random.choice(df['Attack'].to_numpy(), size=10)
print("Randomly Selected Attack Values:", random_attacks)
# Program Name: pokemon_statistical_inference_chapter7.py
import pandas as pd
import numpy as np
from scipy import stats

# --- Define sample Pokémon data ---
# Attack and Speed of each Pokémon are used for statistical estimation.
# The rows are treated as a sample from a larger population.
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
    'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
    'Speed': [90, 100, 30, 45, 130, 55, 110, 55]
}
df = pd.DataFrame(data)

# --- 7-1: Point Estimation ---
# The sample mean of Attack is the point estimate of the population mean.
point_estimate = df['Attack'].mean()
print("Point Estimate (mean Attack):", point_estimate)

# --- 7-1-1: Method of Moments ---
# First and second sample moments estimate the Attack distribution.
moment_mean = np.mean(df['Attack'])
moment_var = np.var(df['Attack'], ddof=0)
print("Moment Estimate μ:", moment_mean)
print("Moment Estimate σ²:", moment_var)

# --- 7-1-2: Maximum Likelihood Estimation ---
# For a normal model the MLEs of mean and variance coincide with the
# method-of-moments estimates (ddof=0 variance).
mle_mean = np.mean(df['Attack'])
mle_var = np.var(df['Attack'], ddof=0)
print("MLE Estimate μ:", mle_mean)
print("MLE Estimate σ²:", mle_var)

# --- 7-2: Criteria for Point Estimators ---
# Sample mean and unbiased (ddof=1) variance.
print("Sample Mean:", np.mean(df['Attack']))
print("Sample Variance (unbiased):", np.var(df['Attack'], ddof=1))

# --- 7-3: Interval Estimation ---
sample = df['Attack']
n = len(sample)
mean = np.mean(sample)
std = np.std(sample, ddof=1)

# --- 7-3-1: CI for the mean (population variance known) ---
# Illustration only: the sample standard deviation stands in for the
# "known" population standard deviation.
sigma_known = std
z = stats.norm.ppf(0.975)
ci_known = (mean - z * sigma_known / np.sqrt(n), mean + z * sigma_known / np.sqrt(n))
print("95% CI (σ known):", ci_known)

# --- 7-3-2: CI for the mean (population variance unknown) ---
# The realistic case: sigma is unknown, so the t distribution is used.
t = stats.t.ppf(0.975, df=n-1)
ci_unknown = (mean - t * std / np.sqrt(n), mean + t * std / np.sqrt(n))
print("95% CI (σ unknown):", ci_unknown)

# --- 7-3-3: CI for the difference of two means (equal variances) ---
# Compare the fast group (Speed > 90) with the remaining Pokémon.
# A single species such as Gengar or Machamp contributes one Attack value,
# and a one-observation group has no ddof=1 variance (NaN) and zero pooled
# degrees of freedom, so each group needs at least two observations.
fast_mask = df['Speed'] > 90
x1 = df.loc[fast_mask, 'Attack']
x2 = df.loc[~fast_mask, 'Attack']
d1, d2 = len(x1), len(x2)
s1, s2 = np.var(x1, ddof=1), np.var(x2, ddof=1)
pool_var = ((d1 - 1) * s1 + (d2 - 1) * s2) / (d1 + d2 - 2)
t_val = stats.t.ppf(0.975, df=d1 + d2 - 2)
diff_mean = x1.mean() - x2.mean()
half_width_equal = t_val * np.sqrt(pool_var * (1/d1 + 1/d2))
ci_diff_equal_var = (diff_mean - half_width_equal, diff_mean + half_width_equal)
print("95% CI (equal var):", ci_diff_equal_var)

# --- 7-3-4: CI for the difference of two means (unequal variances) ---
# Welch's approach: the degrees of freedom come from the
# Welch-Satterthwaite formula instead of d1 + d2 - 2.
se = np.sqrt(s1/d1 + s2/d2)
df_welch = (s1/d1 + s2/d2)**2 / ((s1**2)/(d1**2 * (d1-1)) + (s2**2)/(d2**2 * (d2-1)))
t_welch = stats.t.ppf(0.975, df=df_welch)
ci_diff_unequal_var = (diff_mean - t_welch * se, diff_mean + t_welch * se)
print("95% CI (unequal var):", ci_diff_unequal_var)

# --- 7-3-5: CI for the population variance ---
# (n-1)s² / χ² bounds; the upper χ² quantile gives the lower limit.
chi2_lower = stats.chi2.ppf(0.025, df=n-1)
chi2_upper = stats.chi2.ppf(0.975, df=n-1)
ci_var = ((n-1)*std**2 / chi2_upper, (n-1)*std**2 / chi2_lower)
print("95% CI of Variance:", ci_var)

# --- 7-3-6: CI for a binomial proportion ---
# Share of Pokémon with Attack >= 90, using the normal-approximation (Wald)
# interval p̂ ± z·sqrt(p̂(1-p̂)/n).
success = (df['Attack'] >= 90).sum()
total = len(df)
p_hat = success / total
z = stats.norm.ppf(0.975)
half_width_prop = z * np.sqrt(p_hat*(1-p_hat)/total)
ci_binomial = (p_hat - half_width_prop, p_hat + half_width_prop)
print("95% CI for proportion (Attack ≥ 90):", ci_binomial)

# --- 7-3-7: Meaning of a confidence interval ---
# A 95% interval is not "the probability that the parameter lies inside";
# it means that, in the long run, 95% of intervals built this way contain
# the true parameter. Example: out of 100 such estimates, roughly 95
# intervals contain the true mean.
# Program Name: pokemon_hypothesis_testing_chapter8.py
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# --- Prepare Pokémon Attack and Speed data ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
    'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
    'Speed': [90, 100, 30, 45, 130, 55, 110, 55]
}
df = pd.DataFrame(data)

# --- 8-2-1: Test for the mean (population variance known) ---
mu_0 = 80  # mean under H0
# Illustration only: the ddof=0 sample standard deviation stands in for the
# "known" population sigma.
sigma = np.std(df['Attack'], ddof=0)
n = len(df)
x_bar = np.mean(df['Attack'])
z = (x_bar - mu_0) / (sigma / np.sqrt(n))
p_val_z = 2 * (1 - stats.norm.cdf(abs(z)))  # two-sided
print("8-2-1 Z-test (σ known): z =", z, ", p-value =", p_val_z)

# Plot: standard normal density with the statistic marked at ±z.
x = np.linspace(-4, 4, 200)
y = stats.norm.pdf(x)
plt.plot(x, y, label='Standard Normal Distribution')
plt.axvline(z, color='red', linestyle='--', label=f'z = {z:.2f}')
plt.axvline(-z, color='red', linestyle='--')
plt.title("Z-test for Mean (σ known)")
plt.xlabel("z")
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.show()

# --- 8-2-2: Test for the mean (population variance unknown) ---
t = (x_bar - mu_0) / (np.std(df['Attack'], ddof=1) / np.sqrt(n))
p_val_t = 2 * (1 - stats.t.cdf(abs(t), df=n-1))
print("8-2-2 T-test (σ unknown): t =", t, ", p-value =", p_val_t)

# Plot: t density (n-1 degrees of freedom) with the statistic marked at ±t.
t_range = np.linspace(-4, 4, 200)
t_pdf = stats.t.pdf(t_range, df=n-1)
plt.plot(t_range, t_pdf, label=f't-distribution (df={n-1})')
plt.axvline(t, color='red', linestyle='--', label=f't = {t:.2f}')
plt.axvline(-t, color='red', linestyle='--')
plt.title("T-test for Mean (σ unknown)")
plt.xlabel("t")
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.show()

# --- 8-2-3: Two-sample t-test (equal variance) ---
# Groups: fast Pokémon (Speed > 90) versus the rest. Selecting one species
# per group (e.g. Gengar vs Machamp) yields one observation each, for which
# the two-sample t, Welch and F statistics below are all NaN.
fast_mask = df['Speed'] > 90
g1 = df.loc[fast_mask, 'Attack']
g2 = df.loc[~fast_mask, 'Attack']
t_stat_eq, p_val_eq = stats.ttest_ind(g1, g2, equal_var=True)
print("8-2-3 Equal Variance T-test: t =", t_stat_eq, ", p =", p_val_eq)

# --- 8-2-4: Welch's t-test (unequal variance) ---
t_stat_uneq, p_val_uneq = stats.ttest_ind(g1, g2, equal_var=False)
print("8-2-4 Welch T-test: t =", t_stat_uneq, ", p =", p_val_uneq)

# --- 8-3-2: Chi-squared test for variance (mean unknown) ---
sample_var = np.var(df['Attack'], ddof=1)
sigma0_sq = 100**2  # hypothesized variance
chi2_stat = (n - 1) * sample_var / sigma0_sq
chi2_cdf = stats.chi2.cdf(chi2_stat, df=n-1)
p_var = 2 * min(chi2_cdf, 1 - chi2_cdf)  # two-sided: double the smaller tail
print("8-3-2 Chi-squared test for variance: χ² =", chi2_stat, ", p =", p_var)

# --- 8-3-3: F-test for equality of variances ---
var_g1 = np.var(g1, ddof=1)
var_g2 = np.var(g2, ddof=1)
f_stat = var_g1 / var_g2
dfn, dfd = len(g1)-1, len(g2)-1
f_cdf = stats.f.cdf(f_stat, dfn, dfd)
p_f = 2 * min(f_cdf, 1 - f_cdf)  # two-sided: double the smaller tail
print("8-3-3 F-test for equal variance: F =", f_stat, ", p =", p_f)

# --- 8-5: Proportion test (share of Pokémon with Attack >= 90) ---
success = (df['Attack'] >= 90).sum()
p0 = 0.5
phat = success / n
z_prop = (phat - p0) / np.sqrt(p0 * (1 - p0) / n)
p_prop = 2 * (1 - stats.norm.cdf(abs(z_prop)))
print("8-5 Proportion Z-test: z =", z_prop, ", p =", p_prop)

# --- 8-6: Correlation test (Attack vs Speed) ---
# The p-value returned by pearsonr is discarded; t and p are recomputed by
# hand from r with n-2 degrees of freedom.
r, _ = stats.pearsonr(df['Attack'], df['Speed'])
t_corr = r * np.sqrt((n - 2) / (1 - r**2))
p_corr = 2 * (1 - stats.t.cdf(abs(t_corr), df=n-2))
print("8-6 Correlation test: r =", r, ", t =", t_corr, ", p =", p_corr)

# Plot: scatterplot of Attack vs Speed.
plt.scatter(df['Attack'], df['Speed'])
plt.title("Attack vs Speed")
plt.xlabel("Attack")
plt.ylabel("Speed")
plt.grid(True)
plt.show()

# --- 8-7-1: Goodness-of-fit test (Chi-square) ---
# Bin Attack into three categories and test against equal expected counts.
attack_bins = pd.cut(df['Attack'], bins=[0, 60, 90, 120], labels=["Low", "Mid", "High"])
observed = attack_bins.value_counts().sort_index()
expected = [len(df)/3] * 3
chi2_gof, p_gof = stats.chisquare(f_obs=observed, f_exp=expected)
print("8-7-1 Goodness-of-fit: χ² =", chi2_gof, ", p =", p_gof)

# --- 8-7-2: Chi-squared test of independence ---
speed_bins = pd.cut(df['Speed'], bins=[0, 60, 90, 150], labels=["Slow", "Average", "Fast"])
ctable = pd.crosstab(attack_bins, speed_bins)
chi2_indep, p_indep, _, _ = stats.chi2_contingency(ctable)
print("8-7-2 Test of independence: χ² =", chi2_indep, ", p =", p_indep)

# --- 8-8: Type I and Type II Errors (explanatory output only) ---
# The quotes were previously written as \" which is a syntax error outside a
# string literal.
print("8-8 Type I error (false positive): Rejecting H0 when it's actually true.")
print("8-8 Type II error (false negative): Failing to reject H0 when H1 is true.")
# Program Name: pokemon_nonparametric_bootstrap_chapter9_10.py
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from itertools import combinations, permutations

# --- Prepare data (Pokémon Attack and Speed) ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee', 'Gengar', 'Machamp'],
    'Attack': [55, 84, 110, 49, 110, 55, 65, 100],
    'Speed': [90, 100, 30, 45, 130, 55, 110, 55]
}
df = pd.DataFrame(data)

# Groups under comparison: fast Pokémon (Speed > 90) versus the rest.
# Picking one species per group (e.g. Gengar vs Machamp) gives a single
# observation each, which leaves nothing to permute or resample.
fast_mask = df['Speed'] > 90
x = df.loc[fast_mask, 'Attack'].values
y = df.loc[~fast_mask, 'Attack'].values

# --- 9-1-1: Fisher's permutation test ---
# Enumerate every way of choosing len(x) *positions* out of the pooled data.
# Splitting by position (not by value membership) keeps duplicated values
# such as the two 110s, and combinations avoid counting reorderings of the
# same split repeatedly.
diff_obs = abs(np.mean(x) - np.mean(y))
combined = np.concatenate([x, y])
positions = np.arange(len(combined))
perm_diffs = []
for chosen in combinations(range(len(combined)), len(x)):
    in_first_group = np.isin(positions, chosen)
    perm_diffs.append(abs(combined[in_first_group].mean() - combined[~in_first_group].mean()))
perm_diffs = np.array(perm_diffs)
# Small tolerance so the observed split itself always counts as ">=".
p_fisher = np.mean(perm_diffs >= diff_obs - 1e-12)
print("9-1-1 Fisher Permutation Test: p =", p_fisher)

# --- 9-1-2: Wilcoxon rank-sum test ---
w_stat, p_wilcoxon = stats.ranksums(x, y)
print("9-1-2 Wilcoxon Rank-Sum Test: W =", w_stat, ", p =", p_wilcoxon)

# --- 9-2-1: Mood test (difference in scale) ---
mood_stat, p_mood = stats.mood(x, y)
print("9-2-1 Mood Test: stat =", mood_stat, ", p =", p_mood)

# --- 9-2-2: Ansari-Bradley test (difference in dispersion) ---
ab_stat, p_ab = stats.ansari(x, y)
print("9-2-2 Ansari-Bradley Test: stat =", ab_stat, ", p =", p_ab)

# --- 9-3: Brunner-Munzel test ---
bm_stat, p_bm = stats.brunnermunzel(x, y)
print("9-3 Brunner-Munzel Test: BM =", bm_stat, ", p =", p_bm)

# --- 10-1: Permutation test (simplified) ---
# Covered by the Fisher permutation test above (9-1-1).

# --- 10-2: Bootstrap confidence interval ---
# Seeded so the bootstrap results are reproducible from run to run.
np.random.seed(0)
bs_means = [np.mean(np.random.choice(x, size=len(x), replace=True)) for _ in range(1000)]
ci_boot = np.percentile(bs_means, [2.5, 97.5])
print("10-2 Bootstrap CI for mean Attack (Speed > 90 group):", ci_boot)
plt.hist(bs_means, bins=30, color='lightgreen', edgecolor='black')
plt.axvline(ci_boot[0], color='blue', linestyle='--', label=f'2.5% = {ci_boot[0]:.2f}')
plt.axvline(ci_boot[1], color='blue', linestyle='--', label=f'97.5% = {ci_boot[1]:.2f}')
plt.title("10-2 Bootstrap Distribution of Mean (Speed > 90 group)")
plt.xlabel("Bootstrapped Mean Attack")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()

# --- 10-3: Bootstrap test (difference of means) ---
# Shift both samples to the pooled mean so they satisfy H0 (equal means)
# before resampling; resampling the raw samples would centre the
# distribution on the observed difference, not on the null.
pooled_mean = combined.mean()
x_null = x - x.mean() + pooled_mean
y_null = y - y.mean() + pooled_mean
bs_diffs = [np.mean(np.random.choice(x_null, size=len(x_null), replace=True)) -
            np.mean(np.random.choice(y_null, size=len(y_null), replace=True))
            for _ in range(1000)]
p_boot_test = np.mean([abs(d) >= abs(diff_obs) for d in bs_diffs])
print("10-3 Bootstrap Test of Mean Difference: p =", p_boot_test)
plt.hist(bs_diffs, bins=30, color='lightblue', edgecolor='black')
plt.axvline(diff_obs, color='red', linestyle='--', label=f'Observed Diff = {diff_obs:.2f}')
plt.title("10-3 Bootstrap Distribution of Mean Difference")
plt.xlabel("Mean Difference")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rv_discrete
from sympy import symbols, exp, simplify
# Program Name: pokemon_statistical_foundations_ch1_ch2.py
# Overview: uses Pokémon base stats to work through probability, expected
#           value, variance, distribution functions and moment generating
#           functions in one pass.

# --- Data definition ---
pokemon_stats = {
    'Name': ['フシギダネ', 'ヒトカゲ', 'ゼニガメ', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'ミュウツー'],
    'HP': [45, 39, 44, 35, 60, 160, 106],
    'Attack': [49, 52, 48, 55, 90, 110, 110],
    'Defense': [49, 43, 65, 40, 55, 65, 90]
}
df = pd.DataFrame(pokemon_stats)

# --- Shared settings ---
stat_names = ['HP', 'Attack', 'Defense']
summary_combined = {}
t = symbols('t')  # MGF argument; the same symbol serves every stat

for stat in stat_names:
    values = df[stat].values
    probs = np.ones_like(values) / len(values)  # each row equally likely (assumption)

    # --- Expected value, variance, standard deviation ---
    E_val = np.sum(values * probs)
    Var_val = np.sum((values - E_val)**2 * probs)
    std_val = np.sqrt(Var_val)

    # Distinct values and their probabilities. Rows sharing a value (e.g.
    # Attack 110 appears twice) are merged so that P(X = x) is the total
    # mass at x rather than one 1/len(values) entry per row.
    support, counts = np.unique(values, return_counts=True)
    pmf = counts / len(values)

    # --- MGF: M(t) = sum_x P(X = x) * exp(t * x) ---
    # numpy scalars are converted to Python numbers before entering sympy.
    MGF_expr = sum(float(p) * exp(t * int(x)) for x, p in zip(support, pmf))
    MGF_simplified = simplify(MGF_expr)

    # --- CDF (cumulative distribution) plot ---
    cdf = np.cumsum(pmf)  # support is already sorted by np.unique
    plt.step(support, cdf, where='post', label=f'CDF of {stat}')
    plt.title(f"Cumulative Distribution Function of {stat}")
    plt.xlabel(stat)
    plt.ylabel("P(X ≤ x)")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # --- PMF (bar chart) ---
    plt.bar(support, pmf, color='skyblue', edgecolor='black')
    plt.title(f"Discrete Probability Function of {stat}")
    plt.xlabel(stat)
    plt.ylabel("P(X = x)")
    plt.grid(True, axis='y')
    plt.tight_layout()
    plt.show()

    # --- Store the summary for this stat ---
    summary_combined[stat] = {
        "Expected Value": E_val,
        "Variance": Var_val,
        "Standard Deviation": std_val,
        "MGF": MGF_simplified
    }

summary_combined
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import PowerTransformer
# Program Name: pokemon_probability_distribution_analysis_ch1_to_ch4.py
# Overview: descriptive statistics, distribution shape and a variable
#           transformation for a few first-generation Pokémon, in one pass.

# --- Prepare data (a subset of first-generation Pokémon) ---
pokemon_data = {
    'Name': ['フシギダネ', 'ヒトカゲ', 'ゼニガメ', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'ミュウツー'],
    'HP': [45, 39, 44, 35, 60, 160, 106],
    'Attack': [49, 52, 48, 55, 90, 110, 110],
    'Defense': [49, 43, 65, 40, 55, 65, 90]
}
df = pd.DataFrame(pokemon_data)
stats = ['HP', 'Attack', 'Defense']
summary_stats = {}

# --- Per-stat analysis ---
# Everything down to plt.show() runs once per stat; the loop body had lost
# its indentation, which made the script fail to parse.
for stat in stats:
    x = df[stat]
    q1, q3 = np.percentile(x, [25, 75])
    sample_std = np.std(x, ddof=1)
    summary_stats[stat] = {
        'Mean': np.mean(x),
        'Median': np.median(x),
        'Mode': x.mode().values[0],  # first mode only
        'Range': np.ptp(x),
        'Q1': q1,
        'Q3': q3,
        'IQR': q3 - q1,
        'Variance': np.var(x, ddof=1),
        'StdDev': sample_std,
        'CV': sample_std / np.mean(x),
        'Skewness': skew(x),
        'Kurtosis': kurtosis(x)
    }

    # --- Box-Cox transformation ---
    # A fresh transformer is fitted per stat; the result is stored in a new
    # '<stat>_BoxCox' column.
    pt = PowerTransformer(method='box-cox')
    transformed = pt.fit_transform(x.values.reshape(-1, 1)).flatten()
    df[f'{stat}_BoxCox'] = transformed

    # --- Visualise the distribution before and after the transform ---
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(x, kde=True)
    plt.title(f'{stat} - Original Distribution')
    plt.subplot(1, 2, 2)
    sns.histplot(transformed, kde=True, color='orange')
    plt.title(f'{stat} - Box-Cox Transformed')
    plt.tight_layout()
    plt.show()

# --- Covariance & correlation matrices (original stats only) ---
cov_matrix = df[stats].cov()
corr_matrix = df[stats].corr()

# --- Heatmaps ---
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.heatmap(cov_matrix, annot=True, cmap='Blues')
plt.title("Covariance Matrix")
plt.subplot(1, 2, 2)
sns.heatmap(corr_matrix, annot=True, cmap='Reds')
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

summary_stats
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint, binom, poisson, geom, nbinom, hypergeom
from scipy.stats import uniform, norm, expon, gamma, beta
# --- プログラム名 / Program Name ---
# pokemon_probability_distributions.py
# 内容: ポケモンの設定に基づいた代表的な確率分布の可視化
# Description: Visualizing key probability distributions in Pokémon context
# Shared integer grid 0..20 for the count-valued distributions.
x = np.arange(21)

# === Discrete distributions ===
# 1. Discrete uniform: a fair six-sided die (faces 1..6).
x_uniform = np.arange(1, 7)
pmf_uniform = [1/6 for _ in range(6)]

# 2. Binomial: status effects landed in 10 attacks (success rate 0.3).
pmf_binom = binom(n=10, p=0.3).pmf(x)

# 3. Poisson: wild Pokémon encountered per minute (mean 3).
pmf_poisson = poisson(mu=3).pmf(x)

# 4. Geometric: encounter on which the first shiny appears (p = 1/4096).
#    scipy's geom counts trials, so the grid starts at 1.
x_geom = np.arange(1, 300)
pmf_geom = geom(p=1/4096).pmf(x_geom)

# 5. Negative binomial: failures before the 3rd successful capture (p = 0.2).
pmf_nbinom = nbinom(n=3, p=0.2).pmf(x)

# 6. Hypergeometric: legendaries among 10 draws from 30 Pokémon, 5 of which
#    are legendary (scipy naming: M=30 total, n=5 successes, N=10 draws).
x_hyper = np.arange(0, 6)
pmf_hyper = hypergeom(M=30, n=5, N=10).pmf(x_hyper)

# === Continuous distributions ===
# 7. Continuous uniform: HP individual value on [0, 31].
x_uni = np.linspace(start=0, stop=31, num=500)
pdf_uniform = uniform(loc=0, scale=31).pdf(x_uni)

# 8. Normal: spread of HP with mean 80 and standard deviation 10.
x_norm = np.linspace(start=40, stop=120, num=500)
pdf_norm = norm(loc=80, scale=10).pdf(x_norm)

# 9. Exponential: waiting time until a shiny appears (rate 1/300, scale 300).
x_exp = np.linspace(start=0, stop=1500, num=500)
pdf_exp = expon(scale=300).pdf(x_exp)

# 10. Gamma: multi-hit move power (shape 2, scale 10).
x_gamma = np.linspace(start=0, stop=100, num=500)
pdf_gamma = gamma(a=2, scale=10).pdf(x_gamma)

# 11. Beta: belief distribution over a move's accuracy (a=2, b=5).
x_beta = np.linspace(start=0, stop=1, num=500)
pdf_beta = beta(a=2, b=5).pdf(x_beta)
# === Plotting: one panel per distribution on a 6x2 grid ===
fig, axs = plt.subplots(6, 2, figsize=(15, 20))

# (row, column, Axes method, x values, y values, colour, title)
# Discrete distributions come first, then the continuous ones.
panels = [
    (0, 0, 'bar', x_uniform, pmf_uniform, 'skyblue', "1. Discrete Uniform (Pokémon Dice Roll)"),
    (0, 1, 'bar', x, pmf_binom, 'lightgreen', "2. Binomial (Status Effects in Battles)"),
    (1, 0, 'bar', x, pmf_poisson, 'salmon', "3. Poisson (Wild Pokémon Encounters)"),
    # Only the first 200 points of the geometric PMF are drawn, as a line.
    (1, 1, 'plot', x_geom[:200], pmf_geom[:200], 'gold', "4. Geometric (First Shiny Encounter)"),
    (2, 0, 'bar', x, pmf_nbinom, 'plum', "5. Negative Binomial (Successful Captures)"),
    (2, 1, 'bar', x_hyper, pmf_hyper, 'lightcoral', "6. Hypergeometric (Legendary Pokémon Selection)"),
    (3, 0, 'plot', x_uni, pdf_uniform, 'skyblue', "7. Continuous Uniform (HP IV Distribution)"),
    (3, 1, 'plot', x_norm, pdf_norm, 'lightgreen', "8. Normal (HP Stat Variation)"),
    (4, 0, 'plot', x_exp, pdf_exp, 'salmon', "9. Exponential (Time Until Shiny Pokémon)"),
    (4, 1, 'plot', x_gamma, pdf_gamma, 'gold', "10. Gamma (Multi-Hit Damage Distribution)"),
    (5, 0, 'plot', x_beta, pdf_beta, 'plum', "11. Beta (Move Accuracy Belief)"),
]
for row, column, draw_method, xs, ys, shade, heading in panels:
    panel = axs[row, column]
    getattr(panel, draw_method)(xs, ys, color=shade)
    panel.set_title(heading)

# The twelfth cell of the grid has no distribution to show.
axs[5, 1].axis('off')
for ax in axs.flat:
    ax.grid(True)
plt.tight_layout()
plt.show()
# プログラム名: pokemon_asymptotic_estimation_chapter7_9.py
# 概要: ポケモンを題材に、極限定理、推定法、区間推定をPythonで実装し、統計学の第7~9章を解説
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# --- Pokémon data (Attack base stats) ---
data = {
    'Name': ['Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Gengar', 'Machamp', 'Eevee', 'Blastoise', 'Dragonite'],
    'Attack': [55, 84, 110, 49, 110, 65, 100, 55, 83, 134]
}
df = pd.DataFrame(data)

# --- Chapter 7: limit theorems and asymptotic theory ---
# Central limit theorem: the mean of resampled Attack values approaches a normal shape.
mu, sigma = df['Attack'].mean(), df['Attack'].std(ddof=1)

# Each resample draws `resample_size` values with replacement from the observed
# Attack values, so those values act as the whole population. The standard
# deviation of a single draw is therefore the ddof=0 value, and the sample mean
# has standard deviation population_sd / sqrt(resample_size). Using the ddof=1
# value (`sigma`) here would draw the reference curve slightly too wide.
resample_size = 5
population_sd = df['Attack'].std(ddof=0)
sample_means = [
    np.mean(np.random.choice(df['Attack'], resample_size, replace=True))
    for _ in range(1000)
]

plt.hist(sample_means, bins=30, density=True, alpha=0.6, color='skyblue')
x = np.linspace(min(sample_means), max(sample_means), 100)
plt.plot(
    x,
    stats.norm.pdf(x, loc=mu, scale=population_sd / np.sqrt(resample_size)),
    'r-',
    label='Normal Approx',
)
plt.title('Central Limit Theorem (Sample Mean of Attack)')
plt.xlabel('Sample Mean')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# --- Chapter 8: comparing estimators ---
sample = df['Attack']
n = len(sample)

# Point estimates: the sample mean, the ddof=0 variance and the ddof=1 variance.
mean_mle = np.mean(sample)
var_mle = np.var(sample, ddof=0)
var_unbiased = np.var(sample, ddof=1)
for label, value in (
    ("最尤推定(母平均)MLE mean:", mean_mle),
    ("最尤推定(母分散)MLE var:", var_mle),
    ("不偏分散(unbiased var):", var_unbiased),
):
    print(label, value)

# Jackknife estimate of the mean: combine the full-sample mean with the
# average of the leave-one-out means.
jackknife_means = []
for left_out in range(n):
    jackknife_means.append(np.mean(np.delete(sample.values, left_out)))
jackknife_estimate = n * mean_mle - (n - 1) * np.mean(jackknife_means)
print("ジャックナイフ推定(平均):", jackknife_estimate)

# --- Chapter 9: interval estimation ---
# 95% confidence interval for the mean Attack (variance unknown, t distribution).
t = stats.t.ppf(q=0.975, df=n-1)
se = stats.sem(sample)
t_margin = t * se
ci = (mean_mle - t_margin, mean_mle + t_margin)
print("攻撃値の母平均 95%信頼区間 (t分布):", ci)

# 95% confidence interval for the variance (chi-square distribution).
chi2_lower = stats.chi2.ppf(q=0.025, df=n-1)
chi2_upper = stats.chi2.ppf(q=0.975, df=n-1)
scaled_variance = (n-1)*var_unbiased
ci_var = (scaled_variance / chi2_upper, scaled_variance / chi2_lower)
print("攻撃値の母分散 95%信頼区間:", ci_var)

# Normal-approximation interval for the share of Pokémon with Attack >= 90.
success = (df['Attack'] >= 90).sum()
p_hat = success / n
z = stats.norm.ppf(0.975)
proportion_margin = z * np.sqrt(p_hat*(1-p_hat)/n)
ci_prop = (p_hat - proportion_margin, p_hat + proportion_margin)
print("攻撃力90以上の割合の信頼区間:", ci_prop)
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・仮説検定・2標本検定・偏差値の算出までを一括で学べる統合プログラム。
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind
# --- Step 0: data definition (Pokémon names and HP base stats) ---
pokemon = dict(
    Name=[
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
    ],
    HP=[
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
    ],
)
df = pd.DataFrame(data=pokemon)
# The HP column is the sample used by every analysis step below.
hp_sample = df.loc[:, 'HP']
n = len(hp_sample)
# --- Point estimation ---
mean_hp = np.mean(hp_sample)
var_hp = np.var(hp_sample, ddof=1)
std_hp = np.std(hp_sample, ddof=1)
se_hp = std_hp / np.sqrt(n)
for report_line in (
    "[18] Point Estimation",
    f"Sample Size n = {n}",
    f"Sample Mean (μ̂): {mean_hp:.2f}",
    f"Unbiased Variance (s²): {var_hp:.2f}",
    f"Standard Deviation (s): {std_hp:.2f}",
    f"Standard Error (SE): {se_hp:.2f}\n",
):
    print(report_line)

# --- Interval estimation with the population SD treated as known (30) ---
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
z_margin = z * se_known
ci_known = (mean_hp - z_margin, mean_hp + z_margin)
print("[19] Confidence Interval (σ known)")
print(f"95% CI: {ci_known}\n")

# --- Interval estimation with the population SD unknown (t distribution) ---
t_score = t.ppf(0.975, df=n-1)
t_margin = t_score * se_hp
ci_unknown = (mean_hp - t_margin, mean_hp + t_margin)
print("[20] Confidence Interval (σ unknown)")
print(f"95% CI: {ci_unknown}\n")

# --- Interval estimation for the proportion of Pokémon with HP >= 80 ---
success = np.sum(hp_sample >= 80)
p_hat = success / n
se_p = np.sqrt(p_hat * (1 - p_hat) / n)
proportion_margin = z * se_p
ci_prop = (p_hat - proportion_margin, p_hat + proportion_margin)
print("[21] Confidence Interval for Proportion (P(HP ≥ 80))")
print(f"Proportion: {p_hat:.2f}, 95% CI: {ci_prop}\n")

# --- Interval estimation for the population variance (chi-square) ---
chi2_lower = chi2.ppf(0.025, df=n-1)
chi2_upper = chi2.ppf(0.975, df=n-1)
scaled_variance = (n - 1) * var_hp
ci_var = (scaled_variance / chi2_upper, scaled_variance / chi2_lower)
print("[22] Confidence Interval for Population Variance")
print(f"95% CI for Variance: {ci_var}\n")

# --- Hypothesis test: is the mean HP equal to 80? ---
t_stat, p_val = ttest_1samp(hp_sample, popmean=80)
print("[7章] 仮説検定 H₀: μ = 80")
print(f"t = {t_stat:.3f}, p = {p_val:.4f}\n")

# --- Two-sample Welch t-test (HP < 80 vs HP >= 80) ---
group_low = df.loc[df['HP'] < 80, 'HP']
group_high = df.loc[df['HP'] >= 80, 'HP']
t2_stat, p2_val = ttest_ind(group_low, group_high, equal_var=False)
print("[8章] Welchのt検定(HP < 80 vs HP ≥ 80)")
print(f"t = {t2_stat:.3f}, p = {p2_val:.4f}\n")

# --- Deviation score: 50 + 10 * (HP - mean) / sample SD ---
df['HP_deviation'] = df['HP'].sub(mean_hp).mul(10).div(std_hp).add(50)
print("[偏差値] Pokémon HP Deviation Scores")
print(df[['Name', 'HP', 'HP_deviation']].head())
# --- Plot: histogram of HP ---
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(hp_sample, bins=10, color='skyblue', edgecolor='black')
hist_ax.set_title("Histogram of Pokémon HP")
hist_ax.set_xlabel("HP")
hist_ax.set_ylabel("Frequency")
hist_ax.grid(True)
plt.tight_layout()
plt.show()

# --- Plot: horizontal box plot of HP ---
box_fig, box_ax = plt.subplots()
box_ax.boxplot(hp_sample, vert=False, patch_artist=True)
box_ax.set_title("Boxplot of Pokémon HP")
box_ax.set_xlabel("HP")
plt.tight_layout()
plt.show()

# --- Plot: deviation score of every Pokémon ---
deviation_fig, deviation_ax = plt.subplots(figsize=(10, 6))
deviation_ax.bar(df['Name'], df['HP_deviation'], color='purple')
plt.xticks(rotation=90)
deviation_ax.set_ylabel("HP Deviation Score")
deviation_ax.set_title("Pokémon HP Deviation Values")
plt.tight_layout()
plt.show()

# --- Plot: mean HP of the two groups with SD error bars ---
mean_low, mean_high = group_low.mean(), group_high.mean()
std_low, std_high = group_low.std(), group_high.std()
group_fig, group_ax = plt.subplots()
group_ax.bar(
    ['HP < 80', 'HP ≥ 80'],
    [mean_low, mean_high],
    yerr=[std_low, std_high],
    capsize=10,
    color=['lightcoral', 'lightgreen'],
)
group_ax.set_ylabel("Mean HP")
group_ax.set_title("Group Comparison: Mean HP")
plt.tight_layout()
plt.show()

# --- Plot: the z-based and t-based confidence intervals as shaded bands ---
ci_fig, ci_ax = plt.subplots(figsize=(10, 6))
ci_ax.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
ci_ax.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
ci_ax.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
ci_ax.set_title("Confidence Intervals for Population Mean HP")
ci_ax.set_xlabel("HP")
ci_ax.legend()
plt.tight_layout()
plt.show()
# Program Name: pokemon_population_estimation.py
# Creation Date: 20250420
# Overview: ポケモンの種族値を用いて、統計学における点推定・区間推定・仮説検定・2標本検定・偏差値の算出・分散分析・最尤推定までを一括で学べる統合プログラム。
# Usage: Run with `python pokemon_population_estimation.py`
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, chi2, ttest_1samp, ttest_ind, f_oneway
from scipy.optimize import minimize
# --- Step 0: data definition (Pokémon names and HP base stats) ---
pokemon = dict(
    Name=[
        'Pikachu', 'Charizard', 'Snorlax', 'Bulbasaur', 'Mewtwo', 'Eevee',
        'Gengar', 'Machamp', 'Lapras', 'Alakazam', 'Onix', 'Jigglypuff',
        'Gyarados', 'Dragonite', 'Scyther', 'Hitmonlee', 'Arcanine', 'Venusaur',
        'Blastoise', 'Raichu', 'Nidoking', 'Clefable', 'Ninetales', 'Slowbro',
        'Magneton', 'Dodrio', 'Muk', 'Cloyster', 'Hypno', 'Kingler',
        'Electrode', 'Exeggutor', 'Marowak', 'Rhydon', 'Seadra', 'Tauros',
    ],
    HP=[
        35, 78, 160, 45, 106, 55, 60, 90, 130, 55, 35, 115,
        95, 91, 70, 50, 90, 80, 79, 60, 81, 95, 73, 95,
        50, 60, 105, 50, 85, 55, 60, 95, 60, 105, 55, 75,
    ],
)
df = pd.DataFrame(data=pokemon)

# --- Group column for the one-way ANOVA ---
# Labels cycle A, B, C over the rows; rows left over after the last full
# cycle are all labelled 'Group A'.
full_cycles, leftover_rows = divmod(len(df), 3)
group_labels = ['Group A', 'Group B', 'Group C'] * full_cycles + ['Group A'] * leftover_rows
df['Group'] = group_labels

hp_sample = df['HP']
n = len(hp_sample)
# --- Chapter 9: one-way ANOVA (do the groups differ in HP?) ---
# One list of HP values per group label.
grouped = df.groupby('Group')['HP'].apply(list)
if len(grouped) < 2:
    print("[9章] ANOVA skipped: Less than 2 groups present.\n")
else:
    f_stat, p_val = f_oneway(*grouped)
    print("[9章] 一元配置分散分析 (ANOVA)")
    print(f"F = {f_stat:.3f}, p = {p_val:.4f}\n")
# --- Chapter 10: introducing maximum likelihood ---
def negative_log_likelihood(params, data):
    """Return the negative log-likelihood of `data` under a normal distribution.

    Args:
        params: Two-element sequence ``(mu, sigma)`` giving the mean and the
            standard deviation of the normal distribution.
        data: Array-like of observed values.

    Returns:
        The negated sum of the normal log-densities of `data`. If `sigma` is
        not strictly positive (including NaN) the result is ``np.inf``, so a
        minimiser treats such a point as worse than any valid one instead of
        receiving NaN.
    """
    mu, sigma = params
    if not sigma > 0:
        return np.inf
    observations = np.asarray(data, dtype=float)
    nll = -np.sum(norm.logpdf(observations, loc=mu, scale=sigma))
    return nll
# Start the search at the sample mean and the ddof=0 standard deviation.
initial_guess = [np.mean(hp_sample), np.std(hp_sample)]
# Box constraints: mu >= 0 and sigma >= 1e-5, both without an upper limit.
parameter_bounds = [(0, None), (1e-5, None)]
result = minimize(
    fun=negative_log_likelihood,
    x0=initial_guess,
    args=(hp_sample,),
    bounds=parameter_bounds,
)
mu_mle, sigma_mle = result.x
print("[10章] 最尤推定 (MLE for Normal Distribution)")
print(f"MLE Mean (μ): {mu_mle:.2f}, MLE Std (σ): {sigma_mle:.2f}\n")
# --- Plot: histogram of HP ---
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(hp_sample, bins=10, color='skyblue', edgecolor='black')
hist_ax.set_title("Histogram of Pokémon HP")
hist_ax.set_xlabel("HP")
hist_ax.set_ylabel("Frequency")
hist_ax.grid(True)
plt.tight_layout()
plt.show()

# --- Plot: horizontal box plot of HP ---
box_fig, box_ax = plt.subplots()
box_ax.boxplot(hp_sample, vert=False, patch_artist=True)
box_ax.set_title("Boxplot of Pokémon HP")
box_ax.set_xlabel("HP")
plt.tight_layout()
plt.show()

# --- Plot: deviation score (50 + 10 * standardised HP) of every Pokémon ---
df['HP_deviation'] = df['HP'].sub(df['HP'].mean()).mul(10).div(df['HP'].std()).add(50)
deviation_fig, deviation_ax = plt.subplots(figsize=(10, 6))
deviation_ax.bar(df['Name'], df['HP_deviation'], color='purple')
plt.xticks(rotation=90)
deviation_ax.set_ylabel("HP Deviation Score")
deviation_ax.set_title("Pokémon HP Deviation Values")
plt.tight_layout()
plt.show()

# --- Plot: mean HP of the two groups (HP < 80, HP >= 80) with SD error bars ---
group_low = df.loc[df['HP'] < 80, 'HP']
group_high = df.loc[df['HP'] >= 80, 'HP']
mean_low, mean_high = group_low.mean(), group_high.mean()
std_low, std_high = group_low.std(), group_high.std()
group_fig, group_ax = plt.subplots()
group_ax.bar(
    ['HP < 80', 'HP ≥ 80'],
    [mean_low, mean_high],
    yerr=[std_low, std_high],
    capsize=10,
    color=['lightcoral', 'lightgreen'],
)
group_ax.set_ylabel("Mean HP")
group_ax.set_title("Group Comparison: Mean HP")
plt.tight_layout()
plt.show()

# --- Confidence intervals for the mean: z-based (SD taken as 30) and t-based ---
mean_hp = np.mean(hp_sample)
sigma_known = 30
z = norm.ppf(0.975)
se_known = sigma_known / np.sqrt(n)
z_margin = z * se_known
ci_known = (mean_hp - z_margin, mean_hp + z_margin)
std_hp = np.std(hp_sample, ddof=1)
se_hp = std_hp / np.sqrt(n)
t_score = t.ppf(0.975, df=n-1)
t_margin = t_score * se_hp
ci_unknown = (mean_hp - t_margin, mean_hp + t_margin)

ci_fig, ci_ax = plt.subplots(figsize=(10, 6))
ci_ax.axvline(mean_hp, color='blue', linestyle='--', label='Sample Mean')
ci_ax.axvspan(ci_known[0], ci_known[1], color='green', alpha=0.2, label='CI (σ known)')
ci_ax.axvspan(ci_unknown[0], ci_unknown[1], color='orange', alpha=0.2, label='CI (σ unknown)')
ci_ax.set_title("Confidence Intervals for Population Mean HP")
ci_ax.set_xlabel("HP")
ci_ax.legend()
plt.tight_layout()
plt.show()
# プログラム名: word_statistics_extended.py
# 概要: TF-IDF分析、感情分析、共起行列の可視化、品詞別頻度分析を追加した単語統計解析プログラム
# Program for extended word statistics: TF-IDF, sentiment analysis, co-occurrence matrix, POS tagging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import seaborn as sns
import networkx as nx
import spacy
# --- Sample documents ---
documents = [
    "Pikachu is quick and cheerful. It uses electricity to shock its opponents.",
    "Charizard flies across the sky, breathing fire to defeat its foes.",
    "Snorlax is a heavy and sleepy Pokémon that loves to eat and nap.",
]

# --- TF-IDF (English stop words removed) ---
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
vocabulary_terms = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X.toarray(), columns=vocabulary_terms)
first_doc_scores = tfidf_df.loc[0].sort_values(ascending=False)
print("\n🔍 Top TF-IDF Scores (first document):")
print(first_doc_scores.head(5))

# --- Sentiment analysis: polarity and subjectivity per document ---
print("\n💬 Sentiment Analysis:")
for doc_number, doc in enumerate(documents, start=1):
    blob = TextBlob(doc)
    print(f"Doc {doc_number}: Polarity = {blob.sentiment.polarity:.2f}, Subjectivity = {blob.sentiment.subjectivity:.2f}")
# --- Word co-occurrence matrix (adjacent word pairs) ---
# Lower-cased word tokens, one list per document.
tokenized = [re.findall(r'\b\w+\b', doc.lower()) for doc in documents]
vocab = sorted({word for tokens in tokenized for word in tokens})
vocab_idx = dict(zip(vocab, range(len(vocab))))
co_matrix = np.zeros((len(vocab), len(vocab)), dtype=int)
for tokens in tokenized:
    # Row = a word, column = the word that directly follows it in the document.
    # Every token is in vocab_idx because the vocabulary is built from these tokens.
    for w1, w2 in zip(tokens, tokens[1:]):
        co_matrix[vocab_idx[w1], vocab_idx[w2]] += 1
co_df = pd.DataFrame(co_matrix, index=vocab, columns=vocab)

heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(co_df, cmap='Blues', square=True, ax=heat_ax)
heat_ax.set_title("Word Co-occurrence Matrix")
plt.tight_layout()
plt.show()
# --- Co-occurrence graph drawn with networkx ---
G = nx.from_pandas_adjacency(co_df)
pos = nx.spring_layout(G, k=0.5)
# Only edges with a positive weight are drawn.
edges = [(u, v) for u, v, weight in G.edges(data='weight') if weight > 0]

plt.figure(figsize=(10, 7))
nx.draw_networkx_nodes(G, pos, node_color='skyblue', node_size=700)
nx.draw_networkx_edges(G, pos, edgelist=edges)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.title("Co-occurrence Word Graph")
plt.axis("off")
plt.tight_layout()
plt.show()
# --- Part-of-speech frequency analysis with spaCy ---
nlp = spacy.load("en_core_web_sm")
# All documents are joined into one text and tagged in a single pass.
all_text = " ".join(documents)
doc_spacy = nlp(all_text)
pos_counts = Counter(token.pos_ for token in doc_spacy)
print("\n📘 Part-of-Speech Frequency:")
# The loop variable is named pos_tag so it does not overwrite `pos`, the
# node-layout mapping created for the co-occurrence graph.
for pos_tag, count in pos_counts.items():
    print(f"{pos_tag}: {count}")
# Program Name: pokemon_pivot_table_analysis.py
# Creation Date: 20250424
# Overview: Create pivot tables to analyze Pokémon base stats by type and generation
# Usage: Run with Python, requires pandas installed
import pandas as pd
# --- Sample Pokémon base-stat data ---
data = dict(
    Name=['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charmander', 'Charmeleon', 'Charizard'],
    Type=['Grass', 'Grass', 'Grass', 'Fire', 'Fire', 'Fire'],
    Generation=[1, 1, 1, 1, 1, 1],
    HP=[45, 60, 80, 39, 58, 78],
    Attack=[49, 62, 82, 52, 64, 84],
    Defense=[49, 63, 83, 43, 58, 78],
)
df = pd.DataFrame(data=data)

# --- Pivot table 1: mean of each stat per type ---
pivot_mean_by_type = pd.pivot_table(
    df,
    index='Type',
    values=['HP', 'Attack', 'Defense'],
    aggfunc='mean',
)

# --- Pivot table 2: maximum HP per (type, generation) pair ---
pivot_max_hp = pd.pivot_table(
    df,
    index=['Type', 'Generation'],
    values='HP',
    aggfunc='max',
)

# --- Output: each heading followed by its table ---
print("✅ 平均ステータス(タイプ別) / Mean Stats by Type:", pivot_mean_by_type, sep="\n")
print("\n✅ 最大HP(タイプ×世代) / Max HP by Type and Generation:", pivot_max_hp, sep="\n")
# Program Name: pokemon_statistical_analysis_combined_fixed_v2.py
# Creation Date: 20250424
# Overview: Combined statistical analysis on Pokémon base stats with corrected RMSE calculation
# Usage: Run with Python, requires pandas, scipy, sklearn, matplotlib, seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# --- Sample Pokémon base-stat data: name, type, then six numeric stats ---
pokemon = {
    'Name': ['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charmander', 'Charmeleon', 'Charizard'],
    'Type': ['Grass', 'Grass', 'Grass', 'Fire', 'Fire', 'Fire'],
    'HP': [45, 60, 80, 39, 58, 78],
    'Attack': [49, 62, 82, 52, 64, 84],
    'Defense': [49, 63, 83, 43, 58, 78],
    'Sp. Atk': [65, 80, 100, 60, 80, 109],
    'Sp. Def': [65, 80, 100, 50, 65, 85],
    'Speed': [45, 60, 80, 65, 80, 100],
}
df = pd.DataFrame(data=pokemon)
# --- Stage 2: correlation and covariance ---
# Columns from position 2 onward are the numeric stats (Name and Type come first).
numeric_part = df.iloc[:, 2:]
corr_matrix = numeric_part.corr(method='pearson')
cov_matrix = numeric_part.cov()

# --- Stage 3: estimation (95% t interval from a fixed-seed sample of 4 HP values) ---
sample_hp = df['HP'].sample(n=4, random_state=1)
mean_hp = sample_hp.mean()
hp_standard_error = stats.sem(sample_hp)
conf_int_hp = stats.t.interval(0.95, df=len(sample_hp)-1, loc=mean_hp, scale=hp_standard_error)

# --- Stage 4: hypothesis test (Attack of Fire types vs Grass types) ---
fire = df.loc[df['Type'] == 'Fire', 'Attack']
grass = df.loc[df['Type'] == 'Grass', 'Attack']
t_stat, p_value = stats.ttest_ind(fire, grass)

# --- Stage 5: regression and PCA ---
# Attack is regressed on HP, Defense and Speed; errors are measured on the fitting data.
X = df[['HP', 'Defense', 'Speed']]
y = df['Attack']
reg = LinearRegression()
reg.fit(X, y)
y_pred = reg.predict(X)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y, y_pred)

# PCA: project the numeric stats onto two components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(numeric_part)
explained_variance = pca.explained_variance_ratio_
# --- Print results ---
print("=== Pearson Correlation Matrix ===", corr_matrix, sep="\n")
print("\n=== Covariance Matrix ===", cov_matrix, sep="\n")
print("\n=== Confidence Interval for Sampled HP ===")
print(f"Sample Mean HP: {mean_hp:.2f}")
print(f"95% Confidence Interval: {conf_int_hp}")
print("\n=== T-test: Fire vs Grass (Attack) ===")
print(f"T-statistic = {t_stat:.3f}, P-value = {p_value:.3f}")
print("\n=== Linear Regression: Predicting Attack ===")
print("Coefficients:", reg.coef_)
print("Intercept:", reg.intercept_)
print(f"MAE = {mae:.2f}, RMSE = {rmse:.2f}, R² = {r2:.3f}")
print("\n=== PCA Summary ===")
print("Explained Variance Ratio:", explained_variance)

# --- Plotting: correlation heatmap (left) and PCA scatter (right) ---
report_fig, (corr_ax, pca_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation Heatmap (Pearson)")

# Points are coloured by an integer code derived from the Type column.
type_codes = pd.factorize(df['Type'])[0]
pca_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=type_codes, cmap='viridis')
pca_ax.set_title("PCA of Pokémon Stats")
pca_ax.set_xlabel("PC1")
pca_ax.set_ylabel("PC2")

plt.tight_layout()
plt.show()
# プログラム名: pokemon_data_summary_ch1_plus_regression.py
# 概要: ポケモン第1世代の記述統計 + 単回帰分析(第1章)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
# --- Generation 1 Pokémon data ---
data = {
    'No': [1, 2, 3, 6, 25, 26, 143, 149, 150],
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Sp. Atk': [65, 80, 100, 109, 50, 90, 65, 100, 154],
    'Sp. Def': [65, 80, 100, 85, 50, 80, 110, 100, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
    'Total': [318, 405, 525, 534, 320, 485, 540, 600, 680],
}
df = pd.DataFrame(data=data)
# The six individual stats: every column except No, Name and Total.
stats_columns = [column for column in data if column not in ('No', 'Name', 'Total')]

# --- Basic statistics, one value per stat column ---
stat_table = df[stats_columns]
mean_vals = stat_table.mean()
median_vals = stat_table.median()
# mode() can return several rows; the first row is used.
mode_vals = stat_table.mode().iloc[0]
range_vals = stat_table.max() - stat_table.min()
quartiles = stat_table.quantile([0.25, 0.5, 0.75])
std_vals = stat_table.std()
# Mean absolute deviation around the column mean.
mad_vals = stat_table.sub(mean_vals).abs().mean()
# --- Outlier detection (1.5 * IQR fences and |z| > 2) ---
outlier_summary = {}
for col in stats_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    below_fence = df[col] < Q1 - 1.5 * IQR
    above_fence = df[col] > Q3 + 1.5 * IQR
    iqr_outliers = df[below_fence | above_fence]

    z_scores = zscore(df[col])
    z_outliers = df[np.abs(z_scores) > 2]

    # Names of the flagged Pokémon under each rule.
    outlier_summary[col] = {
        "IQR Outliers": iqr_outliers['Name'].tolist(),
        "Z-score Outliers": z_outliers['Name'].tolist(),
    }
# --- Visualisation: histogram (left) and box plot (right) for each stat ---
for col in stats_columns:
    stat_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))
    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')
    sns.boxplot(data=df[col], orient='h', ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')
    plt.tight_layout()
    plt.show()

# --- Simple linear regression (Attack -> Total) ---
X = df[['Attack']]
y = df['Total']
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
print("\n=== 単回帰分析(Attack → Total) ===")
print(f"回帰係数(slope): {model.coef_[0]:.3f}")
print(f"切片(intercept): {model.intercept_:.3f}")

# --- Scatter of the data with the fitted line ---
regression_fig, regression_ax = plt.subplots(figsize=(8, 5))
regression_ax.scatter(df['Attack'], df['Total'], color='blue', label='Data')
regression_ax.plot(df['Attack'], y_pred, color='red', label='Regression Line')
regression_ax.set_title('Attack vs Total - Linear Regression')
regression_ax.set_xlabel('Attack')
regression_ax.set_ylabel('Total')
regression_ax.grid(True)
regression_ax.legend()
plt.tight_layout()
plt.show()
# --- Print the statistics ---
print("\n=== 基本統計量 ===")
for label, values in (
    ("平均:\n", mean_vals.round(2)),
    ("中央値:\n", median_vals),
    ("最頻値:\n", mode_vals),
    ("範囲:\n", range_vals),
):
    print(label, values)

print("\n=== 四分位数 ===")
print(quartiles)
print("\n標準偏差:\n", std_vals)
print("平均絶対偏差:\n", mad_vals)

# One row per stat, one column per detection rule.
print("\n=== 外れ値検出結果 ===")
outliers_df = pd.DataFrame(outlier_summary).transpose()
print(outliers_df)
# プログラム名: pokemon_stat_distribution_summary.py
# 概要: ポケモン種族値を用いた記述統計、可視化、相関分析(Google Colab対応)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore, pearsonr, mode
from sklearn.preprocessing import StandardScaler
# --- Data definition ---
data = {
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Sp. Atk': [65, 80, 100, 109, 50, 90, 65, 100, 154],
    'Sp. Def': [65, 80, 100, 85, 50, 80, 110, 100, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
    'Total': [318, 405, 525, 534, 320, 485, 540, 600, 680],
}
df = pd.DataFrame(data=data)
# The six individual stats: every column except Name and Total.
stats_cols = [column for column in data if column not in ('Name', 'Total')]
# --- Sturges' formula ---
def sturges_bins(n):
    """Return the histogram bin count given by Sturges' formula.

    Sturges' rule is ``ceil(log2(n)) + 1``. Truncating ``1 + log2(n)`` instead
    gives one bin too few whenever `n` is not a power of two (n=9 would give 4
    rather than 5).

    Args:
        n: Number of observations; must be at least 1.

    Returns:
        The number of bins as an int.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive sample size")
    return int(np.ceil(np.log2(n))) + 1
# --- Statistics and visualisation ---
summary_stats = {}

# Standardise every stat column; the results are kept as "<stat>_zscore" columns.
scaler = StandardScaler()
standardized = scaler.fit_transform(df[stats_cols])
zscore_columns = [f"{col}_zscore" for col in stats_cols]
zscore_df = pd.DataFrame(standardized, columns=zscore_columns)

for col in stats_cols:
    # Histogram whose bin count comes from sturges_bins.
    bins = sturges_bins(len(df[col]))
    hist_fig, hist_ax = plt.subplots(figsize=(12, 5))
    hist_ax.hist(df[col], bins=bins, edgecolor='black', alpha=0.7)
    hist_ax.set_title(f"{col} - Histogram (Sturges: {bins} bins)")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    hist_ax.grid(True)
    plt.tight_layout()
    plt.show()

    # Summary statistics for this column.
    col_mean = df[col].mean()
    col_median = df[col].median()
    col_mode = mode(df[col], keepdims=False).mode
    col_var = df[col].var()
    col_std = df[col].std()
    summary_stats[col] = {
        "Mean": col_mean,
        "Median": col_median,
        "Mode": col_mode,
        "Variance": col_var,
        "Std Dev": col_std,
    }
# --- Bar chart (quantity): HP of each Pokémon ---
plt.figure(figsize=(10, 5))
hp_ax = sns.barplot(x='Name', y='HP', data=df)
hp_ax.set_title('HP by Pokémon')
hp_ax.set_ylabel("HP")
hp_ax.grid(True)
plt.tight_layout()
plt.show()

# --- Line chart: the six stats of the row named カビゴン ---
snorlax = df.loc[df["Name"] == "カビゴン", stats_cols].iloc[0]
line_fig, line_ax = plt.subplots(figsize=(10, 4))
line_ax.plot(stats_cols, snorlax.values, marker='o')
line_ax.set_title('Snorlax Stat Line')
line_ax.set_ylabel("Value")
line_ax.grid(True)
plt.tight_layout()
plt.show()

# --- Pie chart (share): stat composition of the row named ミュウツー ---
mewtwo = df.loc[df["Name"] == "ミュウツー", stats_cols].iloc[0]
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(mewtwo, labels=stats_cols, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Mewtwo Stat Composition')
plt.tight_layout()
plt.show()

# --- Scatter plot with the Pearson correlation: Attack vs Total ---
r, _ = pearsonr(df['Attack'], df['Total'])
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
scatter_ax.scatter(df['Attack'], df['Total'])
scatter_ax.set_title('Attack vs Total')
scatter_ax.set_xlabel('Attack')
scatter_ax.set_ylabel('Total')
# The coefficient is written at (smallest Attack, largest Total).
scatter_ax.text(min(df['Attack']), max(df['Total']), f"r = {r:.2f}", fontsize=12)
scatter_ax.grid(True)
plt.tight_layout()
plt.show()
# --- Summary table: one row per stat, rounded to two decimals ---
summary_df = pd.DataFrame(summary_stats).transpose().round(decimals=2)
# Coefficient of variation computed from the rounded Std Dev and Mean columns.
summary_df['Coefficient of Variation'] = summary_df['Std Dev'].div(summary_df['Mean']).round(2)

# --- Z-score table with the Pokémon names attached ---
summary_df_z = zscore_df.assign(Name=df['Name'])

# --- Output ---
print("=== ポケモン種族値の統計要約 ===")
print(summary_df)
print("\n=== 標準化(Zスコア)一覧 ===")
print(summary_df_z.set_index('Name').round(2))
# プログラム名: pokemon_stat_integral_methods_summary.py
# 概要: ポケモンSpeedの統計的積分(PDF, CDF, E[X], Var[X], Monte Carlo)を可視化
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.integrate import quad
# --- Data definition ---
data = {
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
}
df = pd.DataFrame(data=data)
speed_data = df['Speed']

# --- Normal-distribution parameters taken from the Speed sample ---
mu = speed_data.mean()
sigma = speed_data.std()
speed_dist = norm(mu, sigma)
# The moment integrals below are truncated to mu ± 4 sigma.
moment_lower, moment_upper = mu - 4*sigma, mu + 4*sigma


def mean_integrand(value):
    """Return x * f(x) for the fitted normal density f."""
    return value * speed_dist.pdf(value)


def variance_integrand(value):
    """Return (x - mu)^2 * f(x) for the fitted normal density f."""
    return (value - mu)**2 * speed_dist.pdf(value)


# --- 1. Integrate the PDF: P(60 <= x <= 100) ---
pdf_integral, _ = quad(speed_dist.pdf, 60, 100)
# --- 2. The same probability as a difference of CDF values ---
cdf_integral = speed_dist.cdf(100) - speed_dist.cdf(60)
# --- 3. Expected value E[X] = ∫ x f(x) dx ---
expected_value, _ = quad(mean_integrand, moment_lower, moment_upper)
# --- 4. Variance Var(X) = ∫ (x - mu)^2 f(x) dx ---
variance_value, _ = quad(variance_integrand, moment_lower, moment_upper)
# --- 5. Monte Carlo approximation: share of 10000 normal draws inside [60, 100] ---
samples = np.random.normal(mu, sigma, 10000)
inside_interval = (samples >= 60) & (samples <= 100)
monte_carlo = np.mean(inside_interval)
# --- Visualisation: the fitted PDF with the integrated region shaded ---
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 500)
pdf_vals = norm.pdf(x, mu, sigma)
in_region = (x >= 60) & (x <= 100)

pdf_fig, pdf_ax = plt.subplots(figsize=(10, 5))
pdf_ax.plot(x, pdf_vals, label='Normal PDF (Speed)')
pdf_ax.fill_between(x, pdf_vals, where=in_region, alpha=0.4, color='orange', label='P(60 ≤ x ≤ 100)')
pdf_ax.set_title('Speed Distribution with Integrated Area (60-100)')
pdf_ax.set_xlabel('Speed')
pdf_ax.set_ylabel('Density')
pdf_ax.legend()
pdf_ax.grid(True)
plt.tight_layout()
plt.show()
# --- Output the results as a one-column table ---
result = {
    "PDF積分 P(60≤x≤100)": round(pdf_integral, 4),
    "CDF差分 P(60≤x≤100)": round(cdf_integral, 4),
    "期待値 E[X]": round(expected_value, 2),
    "分散 Var(X)": round(variance_value, 2),
    "モンテカルロ積分近似": round(monte_carlo, 4),
    "標本平均 μ": round(mu, 2),
    "標本標準偏差 σ": round(sigma, 2)
}
result_df = pd.DataFrame.from_dict(result, orient='index', columns=['値'])
# `display` is only predefined in IPython-based sessions (Jupyter, Colab).
# When this runs as a plain script the name does not exist, so fall back to
# print instead of ending with a NameError.
try:
    display(result_df)
except NameError:
    print(result_df)
# プログラム名: pokemon_ml_outlier_combined.py
# 概要: ポケモン種族値で回帰予測・外れ値検出・可視化(Google Colab対応)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# --- Data definition ---
data = {
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Sp. Atk': [65, 80, 100, 109, 50, 90, 65, 100, 154],
    'Sp. Def': [65, 80, 100, 85, 50, 80, 110, 100, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
    'Total': [318, 405, 525, 534, 320, 485, 540, 600, 680],
}
df = pd.DataFrame(data=data)
# The six individual stats: every column except Name and Total.
stats_cols = [column for column in data if column not in ('Name', 'Total')]
# ============================
# (1) Outlier detection (IQR rule and z-score)
# ============================
outliers = []
for col in stats_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    below_fence = df[col] < Q1 - 1.5 * IQR
    above_fence = df[col] > Q3 + 1.5 * IQR
    iqr_out = df[below_fence | above_fence]

    z_scores = zscore(df[col])
    z_out = df[np.abs(z_scores) > 2]

    # One record per stat; flagged names are joined into a single string.
    outliers.append({
        'Stat': col,
        'IQR Outliers': ', '.join(iqr_out['Name']),
        'Z-Score Outliers': ', '.join(z_out['Name']),
    })
outlier_df = pd.DataFrame(outliers)
print("=== 外れ値検出(IQR法 & Zスコア) ===")
print(outlier_df)

# Visualisation: box plot of Speed.
speed_fig, speed_ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['Speed'], ax=speed_ax)
speed_ax.set_title('Speed - Boxplot with IQR Outlier Detection')
speed_ax.grid(True)
plt.tight_layout()
plt.show()
# ========================
# (2) Predicting Total with several regression models
# ========================
X = df[stats_cols]
y = df['Total']
# 30% of the rows are held out for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Decision Tree": DecisionTreeRegressor(max_depth=3),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42),
}

# Fit every model on the training rows and score it on the held-out rows.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results.append({'Model': name, 'RMSE': round(rmse, 2), 'R² Score': round(r2, 2)})

results_df = pd.DataFrame(results)
print("\n=== ポケモン種族値による回帰モデル比較 ===")
print(results_df)
# Visualisation: linear regression of Total on Attack.
lin_model = LinearRegression().fit(df[['Attack']], df['Total'])
# The model is fitted on a DataFrame with an 'Attack' column, so the prediction
# grid is given the same column name. A bare ndarray here makes scikit-learn
# warn that X does not have valid feature names.
x_vals = pd.DataFrame({'Attack': np.linspace(df['Attack'].min(), df['Attack'].max(), 100)})
y_vals = lin_model.predict(x_vals)

plt.figure(figsize=(8, 5))
plt.scatter(df['Attack'], df['Total'], label='Data')
plt.plot(x_vals['Attack'], y_vals, color='red', label='Linear Regression')
plt.title('Attack vs Total - Linear Regression')
plt.xlabel('Attack')
plt.ylabel('Total')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# プログラム名: pokemon_ml_full_pipeline.py
# 概要: ポケモン種族値を用いたML統合:勾配降下法, ニューラルネット, 交差検証, クラスタリング
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# --- Data definition ---
data = {
    'Name': ['フシギダネ', 'フシギソウ', 'フシギバナ', 'リザードン', 'ピカチュウ', 'ライチュウ', 'カビゴン', 'カイリュー', 'ミュウツー'],
    'HP': [45, 60, 80, 78, 35, 60, 160, 91, 106],
    'Attack': [49, 62, 82, 84, 55, 90, 110, 134, 110],
    'Defense': [49, 63, 83, 78, 40, 55, 65, 95, 90],
    'Sp. Atk': [65, 80, 100, 109, 50, 90, 65, 100, 154],
    'Sp. Def': [65, 80, 100, 85, 50, 80, 110, 100, 90],
    'Speed': [45, 60, 80, 100, 90, 110, 30, 80, 130],
    'Total': [318, 405, 525, 534, 320, 485, 540, 600, 680],
}
df = pd.DataFrame(data=data)
# Inputs are the six individual stats; the target is the Total column.
features = [column for column in data if column not in ('Name', 'Total')]
X = df[features]
y = df['Total']
# --- Standardisation (shared by the supervised and unsupervised steps) ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- (1) Loss minimisation: regression fitted with SGD ---
sgd = SGDRegressor(max_iter=1000, learning_rate='invscaling', eta0=0.01, random_state=42)
sgd.fit(X_scaled, y)
sgd_pred = sgd.predict(X_scaled)
# RMSE is measured on the same rows the model was fitted on.
sgd_rmse = np.sqrt(mean_squared_error(y, sgd_pred))
print(f" SGD回帰モデル RMSE(勾配降下法): {sgd_rmse:.2f}")

# --- (2) Neural-network regression: one hidden layer of 6 ReLU units ---
mlp = MLPRegressor(hidden_layer_sizes=(6,), activation='relu', max_iter=2000, random_state=42)
mlp.fit(X_scaled, y)
mlp_pred = mlp.predict(X_scaled)
mlp_rmse = np.sqrt(mean_squared_error(y, mlp_pred))
print(f" ニューラルネット(MLP)RMSE: {mlp_rmse:.2f}")
# --- (3) Cross-validation (R² score) ---
# R² is undefined for a test fold with fewer than two rows (scikit-learn
# returns nan for it), which would make the printed mean nan. With 9 rows,
# 5 folds leaves one fold with a single row, so the fold count is capped to
# keep at least two rows in every test fold.
# NOTE(review): assumes y has at least 4 rows; KFold requires n_splits >= 2.
n_folds = min(5, len(y) // 2)
cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = cross_val_score(mlp, X_scaled, y, scoring='r2', cv=cv)
print(f" 交差検証(MLP)平均R²スコア: {cv_scores.mean():.2f}")
# --- (4) Unsupervised learning: KMeans clustering into 3 clusters ---
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
df['Cluster'] = cluster_labels

# --- Visualise the clusters on the Attack / Speed plane ---
cluster_fig, cluster_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='Attack', y='Speed', hue='Cluster', palette='Set2', s=100, ax=cluster_ax)
cluster_ax.set_title('KMeans Clustering: Attack vs Speed')
cluster_ax.grid(True)
plt.tight_layout()
plt.show()

# --- Print each Pokémon with its cluster label ---
print("\n=== ポケモンとクラスタラベル ===")
print(df[['Name', 'Cluster']])