ポケモン統計学(日記)

Python

Posted at 2025-05-08

# Program Name: pokemon_battle_usage_analysis.py
# Overview: ピカチュウの使用頻度に関するデータを国・地域別に可視化し、中央値を計算する

import pandas as pd
import matplotlib.pyplot as plt

# Dummy usage-share data (percent) per battle-count bracket and region.
# NOTE(review): the regional columns are illustrative; they do not each sum to
# exactly 100 (USA adds up to roughly 112).
battle_ranges = ["1-3", "4-6", "7-13", "14-20", "21-27", "28-90", "91+"]
usage_by_region = {
    "Japan": [32.14, 61.30, 5.89, 0.43, 0.10, 0.08, 0.02],
    "USA": [12.53, 54.68, 42.18, 2.04, 0.04, 0.20, 0.32],
    "France": [2.82, 34.65, 49.54, 4.06, 0.04, 0.70, 0.19],
    "Galar": [0.93, 5.26, 38.27, 35.37, 13.78, 0.23, 0.20],
    "Paldea": [7.48, 19.07, 18.02, 45.00, 2.41, 7.02, 0.00],
}
data = {"Battle Range": battle_ranges, **usage_by_region}

# One row per bracket, one column per region (plus the bracket label column).
df = pd.DataFrame(data)

# Estimated median bracket per region, read off the cumulative distribution.
def estimated_median(row, labels=None, threshold=50):
    """Return the bracket in which the cumulative share first reaches ``threshold``.

    Args:
        row: Iterable of percentage shares, one per bracket, in bracket order.
        labels: Bracket labels aligned with ``row``. When omitted, the
            module-level ``df["Battle Range"]`` column is used, which is what
            the original implementation always did.
        threshold: Cumulative percentage that marks the median (50 by default).

    Returns:
        The label of the first bracket whose running total is >= ``threshold``.
        If the total never reaches it, the last label is returned ("91+" when
        there are no labels at all).
    """
    if labels is None:
        labels = df["Battle Range"]
    # Materialise once so lookup is positional and independent of any pandas index.
    labels = list(labels)

    cumulative = 0
    for label, value in zip(labels, row):
        cumulative += value
        if cumulative >= threshold:
            return label
    # The running total never reached the threshold: fall back to the open-ended bracket.
    return labels[-1] if labels else "91+"

# Estimated median bracket for every regional column (all columns except the labels).
region_columns = [column for column in df.columns if column != "Battle Range"]
median_values = {}
for region in region_columns:
    median_values[region] = estimated_median(df[region])

# Print the median bracket of each region.
print("Estimated median battle range for Pikachu usage:")
for region, median in median_values.items():
    print(f"{region}: {median}")

# Grouped bar chart: one group per bracket, one bar per region.
ax = df.set_index("Battle Range").plot.bar(figsize=(10, 6))
ax.set_title("Pikachu Usage Distribution by Battle Frequency")
ax.set_xlabel("Battle Frequency")
ax.set_ylabel("Usage Percentage (%)")
ax.grid(True, axis='y')
plt.tight_layout()
plt.show()

# --- 必要なライブラリのインポート / Import necessary libraries ---
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

# --- Dummy item data: purchase quantities and average prices for 2015 and 2016 ---
item_columns = ['Item', 'Quantity_2015', 'Price_2015', 'Quantity_2016', 'Price_2016']
item_rows = [
    # (item, quantity 2015, price 2015, quantity 2016, price 2016)
    ('モンスターボール', 6200, 200, 6400, 210),
    ('キズぐすり', 19865, 300, 20418, 280),
    ('わざマシン', 500, 4000, 550, 4200),
]
# Transpose the rows into the column-oriented dict the DataFrame is built from.
data = {column: list(values) for column, values in zip(item_columns, zip(*item_rows))}
df = pd.DataFrame(data)

# --- Laspeyres price index (base year: 2015) ---
# The 2015 basket valued at 2016 prices, relative to the same basket at 2015 prices.
base_quantities = df['Quantity_2015']
numerator = base_quantities.mul(df['Price_2016']).sum()
denominator = base_quantities.mul(df['Price_2015']).sum()
price_ratio = numerator / denominator
laspeyres_index = price_ratio * 100
print(f"Laspeyres価格指数（2015年基準）: {laspeyres_index:.2f}")

# --- Simple linear regression: temperature vs. days until bloom ---
temperature = np.array([13, 14, 15, 16, 17, 18, 19])   # temperature [deg C]
bloom_day = np.array([105, 102, 100, 97, 94, 90, 87])  # days until bloom

# Design matrix with a leading column of ones so the model has an intercept.
X = sm.add_constant(temperature)
ols_spec = sm.OLS(bloom_day, X)
model = ols_spec.fit()
print(model.summary())

# --- Prediction at 17.5 deg C ---
# A single row laid out explicitly as [constant, temperature], matching X.
X_pred = np.array([[1, 17.5]])
predicted_values = model.predict(X_pred)
predicted_day = predicted_values[0]
print(f"2017年予測開花日（日数）: {predicted_day:.1f}日")

# --- Pearson correlation between temperature and days until bloom ---
correlation_matrix = np.corrcoef(temperature, bloom_day)
correlation = correlation_matrix[0, 1]
print(f"相関係数（気温と開花日数）: {correlation:.2f}")

# --- Residuals of the fitted model, plotted against the fitted values ---
residuals = model.resid
fig, ax = plt.subplots()
ax.scatter(model.fittedvalues, residuals)
ax.axhline(0, color='gray', linestyle='--')  # zero-residual reference line
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
ax.grid(True)
plt.show()

# --- Observed points together with the fitted regression line ---
fig, ax = plt.subplots()
ax.scatter(temperature, bloom_day, label='Observed')
ax.plot(temperature, model.predict(X), color='red', label='Regression Line')
ax.set_xlabel("Average Temperature (°C)")
ax.set_ylabel("Days to Bloom")
ax.set_title("Pokémon Bloom Prediction")
ax.legend()
ax.grid(True)
plt.show()

# --- Overlaid histograms of the 2015 and 2016 item prices ---
fig, ax = plt.subplots()
for year in ('2015', '2016'):
    ax.hist(df[f'Price_{year}'], alpha=0.7, bins=5, label=year)
ax.set_title("Item Price Distribution")
ax.set_xlabel("Price (yen)")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True)
plt.show()

# --- ライブラリのインポート / Import necessary libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sympy as sp

# ------------------------------
# [Q4] Laspeyres price index (item prices)
# ------------------------------

print("▶ Laspeyres価格指数の計算")

# Two items: quantities purchased and unit prices in 2015 and 2016.
item_data = dict(
    Item=['モンスターボール', 'げんきのかけら'],
    Quantity_2015=[6200, 19865],
    Price_2015=[200, 1500],
    Quantity_2016=[6422, 20418],
    Price_2016=[210, 1440],
)
df = pd.DataFrame(item_data)

# Base-year (2015) quantities weight both the 2016 and the 2015 prices.
weights = df['Quantity_2015']
numerator = weights.dot(df['Price_2016'])
denominator = weights.dot(df['Price_2015'])
laspeyres_index = numerator / denominator * 100

print(f"Laspeyres価格指数（2015年基準）: {laspeyres_index:.2f}\n")

# ------------------------------
# [Q11] Year-over-year change of a price index (example: Poké Ball)
# ------------------------------

print("▶ モンスターボールの価格指数変化率を描画")

# Synthetic index for 1970-2016 (47 yearly values) in three phases.
years = np.arange(1970, 2017)
rising_phase = np.linspace(30, 85, 20)      # increase
flat_phase = np.full(10, 85.0)              # plateau
recovery_phase = np.linspace(85, 105, 17)   # renewed increase
index = np.concatenate([rising_phase, flat_phase, recovery_phase])

# Percentage change relative to the previous year's index value.
yearly_step = index[1:] - index[:-1]
diff_rate = yearly_step / index[:-1] * 100

# Line chart of the yearly change rate; the first year has no predecessor, so it is skipped.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(years[1:], diff_rate, marker='o')
ax.set_title("Yearly Change in Poké Ball Price Index")
ax.set_xlabel("Year")
ax.set_ylabel("Change Rate (%)")
ax.grid(True)
plt.tight_layout()
plt.show()

# ------------------------------
# [Q14] Bayes' theorem (source of a defective item)
# ------------------------------

print("▶ ベイズの定理：不良品がA工場産である確率")

# Prior share of each factory and the defect rate conditional on the factory.
P_A, P_B = 0.6, 0.4
P_N_given_A, P_N_given_B = 0.01, 0.05

# Joint probabilities of "made in factory X and defective".
joint_A = P_N_given_A * P_A
joint_B = P_N_given_B * P_B

# Posterior probability that a defective item came from factory A.
P_A_given_N = joint_A / (joint_A + joint_B)
print(f"不良アイテムがA工場製である確率: {P_A_given_N:.2%}\n")

# ------------------------------
# [Q15/16] Probability density function of the appearance time X
# ------------------------------

print("▶ ポケモン出現時間の密度関数解析")

# Density f(x) = c * (2 - x), integrated over 0..2 throughout this block.
x, c = sp.symbols('x c')
support = (x, 0, 2)
fx = c * (2 - x)

# Normalisation: choose c so that the density integrates to 1.
integral = sp.integrate(fx, support)
c_val = sp.solve(integral - 1, c)[0]
density = c_val * (2 - x)

# Mean
mu = sp.integrate(x * density, support)

# Variance
variance = sp.integrate((x - mu)**2 * density, support)

print(f"正規化定数 c = {c_val}")
print(f"平均（μ）= {mu}")
print(f"分散（σ²）= {variance}")

# --- Import necessary libraries / ライブラリのインポート ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sympy as sp

# =============================================================
# [Q4] Laspeyres price index: comparing Pokémon item prices
# =============================================================

print("【問4】Laspeyres Price Index (Pokémon Items)")

# Two goods (Poké Ball and Revive) observed in 2015 and 2016.
item_names = ['モンスターボール', 'げんきのかけら']
item_data = {
    'Item': item_names,
    'Quantity_2015': [6200, 19865],  # quantity purchased in 2015
    'Price_2015': [200, 1500],       # price in 2015 (yen/unit)
    'Quantity_2016': [6422, 20418],  # quantity purchased in 2016
    'Price_2016': [210, 1440],       # price in 2016 (yen/unit)
}

df = pd.DataFrame(item_data)

# Laspeyres index = sum(Quantity_2015 * Price_2016) / sum(Quantity_2015 * Price_2015) * 100
basket_at_2016_prices = df['Quantity_2015'] * df['Price_2016']
basket_at_2015_prices = df['Quantity_2015'] * df['Price_2015']
numerator = basket_at_2016_prices.sum()
denominator = basket_at_2015_prices.sum()
laspeyres_index = numerator / denominator * 100

print(f"Laspeyres価格指数（2015年基準）: {laspeyres_index:.2f}\n")


# =============================================================
# [Q11] Change-rate chart: Poké Ball price index over time
# =============================================================

print("【問11】Plotting the Yearly Change Rate of Poké Ball Price Index")

# Synthetic Poké Ball price index for 1970-2016, assembled from three phases.
years = np.arange(1970, 2017)
index_phases = [
    np.linspace(30, 85, 20),     # 1970-1989: increasing trend
    np.ones(10) * 85,            # 1990-1999: flat
    np.linspace(85, 105, 17),    # 2000-2016: rising again
]
index = np.concatenate(index_phases)

# Year-over-year change (%) = (current - previous) / previous * 100
previous_index = index[:-1]
diff_rate = np.diff(index) / previous_index * 100

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(years[1:], diff_rate, marker='o')
ax.set_title("Yearly Change in Poké Ball Price Index")
ax.set_xlabel("Year")
ax.set_ylabel("Change Rate (%)")
ax.grid(True)
plt.tight_layout()
plt.show()


# =============================================================
# [Q14] Bayes' theorem: where did a defective item come from? (example: Potion)
# =============================================================

print("【問14】Bayes' Theorem: Predicting the Source of Defective Items")

# Example: Potions produced by factories A and B.
P_A = 0.6               # share of production from factory A
P_B = 0.4               # share of production from factory B
P_N_given_A = 0.01      # defect rate of factory A
P_N_given_B = 0.05      # defect rate of factory B

# Total probability of a defect, summed over both factories.
P_N = P_N_given_A * P_A + P_N_given_B * P_B

# Bayes' theorem: P(A|Defect) = P(Defect|A) * P(A) / P(Defect)
P_A_given_N = (P_N_given_A * P_A) / P_N
print(f"不良品がA工場製である確率: {P_A_given_N:.2%}\n")


# =============================================================
# [Q15/16] Density function analysis: time X until a Pokémon appears
# =============================================================

print("【問15・16】Probability Density Function Analysis (Time until a Legendary Pokémon appears)")

# Density of the appearance time on 0 <= x < 2: f(x) = c(2 - x)
x, c = sp.symbols('x c')
fx = c * (2 - x)
lower, upper = 0, 2

# Normalisation: find the constant c for which the integral of f over [0, 2] equals 1.
integral = sp.integrate(fx, (x, lower, upper))
candidates = sp.solve(integral - 1, c)
c_val = candidates[0]

# Mean: integral of x * f(x) over [0, 2]
mu = sp.integrate(x * c_val * (2 - x), (x, lower, upper))
# Variance: integral of (x - mu)^2 * f(x) over [0, 2]
squared_deviation = (x - mu)**2
variance = sp.integrate(squared_deviation * c_val * (2 - x), (x, lower, upper))

print(f"正規化定数 c = {c_val}")
print(f"平均 (μ) = {mu}")
print(f"分散 (σ²) = {variance}")

# --- Import required libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sympy as sp
from scipy.stats import norm, binom

# -----------------------------
# 1. Bayes' theorem: which region a Legendary Pokémon is in
# -----------------------------
P_A = 0.6              # prior probability of region A
P_L_given_A = 0.9      # probability the Legendary appears given region A
P_L_given_not_A = 0.2  # probability the Legendary appears given any other region

# Total probability of an appearance, then the posterior for region A.
P_not_A = 1 - P_A
P_L = P_L_given_A * P_A + P_L_given_not_A * P_not_A
likelihood_times_prior = P_L_given_A * P_A
P_A_given_L = likelihood_times_prior / P_L

# -----------------------------
# 2. Binomial distribution: Poké Ball hit probability
# -----------------------------
p_hit = 0.159
n_throws = 5
x_vals = np.arange(n_throws + 1)             # possible hit counts 0..5
probs = binom.pmf(x_vals, n_throws, p_hit)   # probability of each hit count
prob_at_least_1 = 1 - probs[0]               # complement of "no hit at all"

# Bar chart of the number of hits in 5 throws.
fig, ax = plt.subplots()
ax.bar(x_vals, probs, color='skyblue')
ax.set_title("Number of Hits in 5 Throws (Poké Ball)")
ax.set_xlabel("Hits")
ax.set_ylabel("Probability")
ax.grid(True)
plt.show()

# -----------------------------
# 3. Normal distribution: P(Z > 1)
# -----------------------------
z_vals = np.linspace(-3, 3, 300)
pdf_vals = norm.pdf(z_vals)
upper_tail = z_vals > 1  # region shaded on the chart

fig, ax = plt.subplots()
ax.plot(z_vals, pdf_vals, label='Standard Normal PDF')
ax.fill_between(z_vals, pdf_vals, where=upper_tail, color='orange', alpha=0.5, label='P(Z > 1)')
ax.set_title("Standard Normal Distribution")
ax.set_xlabel("Z")
ax.set_ylabel("Density")
ax.legend()
ax.grid(True)
plt.show()

# Upper-tail probability as the complement of the CDF at 1.
prob_z_greater_than_1 = 1 - norm.cdf(1)

# -----------------------------
# 4. Expected value and variance of the Pokémon appearance time
# -----------------------------
# Density f(x) = c * (2 - x) on [0, 2]; c is fixed by requiring total probability 1.
x, c = sp.symbols('x c')
fx = c * (2 - x)
bounds = (x, 0, 2)
integral = sp.integrate(fx, bounds)
c_val = sp.solve(integral - 1, c)[0]
mu = sp.integrate(x * c_val * (2 - x), bounds)
variance = sp.integrate((x - mu)**2 * c_val * (2 - x), bounds)

# Evaluate the normalised density on a grid for plotting.
x_vals_pdf = np.linspace(0, 2, 100)
f_vals_pdf = []
for val in x_vals_pdf:
    f_vals_pdf.append(float(c_val * (2 - val)))

fig, ax = plt.subplots()
ax.plot(x_vals_pdf, f_vals_pdf, color='green')
ax.set_title("PDF of Pokémon Appearance Time")
ax.set_xlabel("Time (hours)")
ax.set_ylabel("Density")
ax.grid(True)
plt.show()

# -----------------------------
# 5. Regression Prediction: Temperature → Appearance day
# -----------------------------
from datetime import date, timedelta

b = -2.7608  # Slope
a = 20.0209  # Intercept
temp_2017 = 6.10
predicted_day = a + b * temp_2017

# Day 0 is March 31, so the calendar date is that day plus the rounded day count.
# The previous "April {3 + day}" formula shifted the result three days late and
# could not represent values outside April.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
_DAY_ZERO = date(2017, 3, 31)
_predicted_date = _DAY_ZERO + timedelta(days=round(predicted_day))
predicted_day_date = f"{_MONTH_NAMES[_predicted_date.month - 1]} {_predicted_date.day}"

# Regression line over 3-10 deg C, with the 2017 prediction marked on it.
temps = np.linspace(3, 10, 50)
days = a + b * temps

fig, ax = plt.subplots()
ax.plot(temps, days, label='Regression Line', color='red')
ax.scatter([6.10], [predicted_day], color='blue', label='Prediction (2017)')
ax.set_title("Temperature vs. Predicted Appearance Day")
ax.set_xlabel("Temperature (°C)")
ax.set_ylabel("Predicted Day (days after Mar 31)")
ax.legend()
ax.grid(True)
plt.show()

# -----------------------------
# 6. Autocorrelation (lag-1)
# -----------------------------
prices = [200, 210, 220, 230, 240, 250, 240, 230, 220, 210, 200]

# Correlate the series with itself shifted by one step.
earlier, later = prices[:-1], prices[1:]
lag1_corr = np.corrcoef(earlier, later)[0, 1]

fig, ax = plt.subplots()
ax.plot(prices, marker='o', label='Item Price')
ax.set_title("Pokémon Item Price Trend (Example)")
ax.set_xlabel("Time")
ax.set_ylabel("Price")
ax.grid(True)
ax.legend()
plt.show()

# -----------------------------
# Display results in a table
# -----------------------------
# One entry per metric computed in the sections above.
results = {
    "P(A|Legendary)": round(P_A_given_L, 4),
    "P(Hit ≥ 1 out of 5)": round(prob_at_least_1, 4),
    "P(Z > 1)": round(prob_z_greater_than_1, 4),
    "E[Appearance Time]": float(mu),
    "Var[Appearance Time]": float(variance),
    "Predicted Day (Temp 6.1°C)": predicted_day_date,
    "Lag-1 Autocorrelation": round(lag1_corr, 4)
}

df = pd.DataFrame(results.items(), columns=["Metric", "Value"])
print("▼ Pokémon Statistics Summary")
try:
    # `display` only exists as a builtin inside IPython/Jupyter sessions.
    display(df)
except NameError:
    # Plain Python interpreter: fall back to a text table instead of crashing.
    print(df.to_string(index=False))

# --- Import libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import binom

# --- Fix the global NumPy seed so the simulated samples are reproducible ---
np.random.seed(42)

# -------------------------------
# 1. F-test simulation (Legendary success rates)
# -------------------------------
# Two simulated samples; region A is drawn first, then region B.
region_A_success = np.random.normal(loc=80, scale=5, size=20)
region_B_success = np.random.normal(loc=75, scale=10, size=10)

# Sample means and unbiased (ddof=1) sample variances.
mean_A, mean_B = region_A_success.mean(), region_B_success.mean()
var_A = region_A_success.var(ddof=1)
var_B = region_B_success.var(ddof=1)

# Variance ratio with region A in the numerator.
F_statistic = var_A / var_B

# -------------------------------
# 2. Binomial confidence interval (League interest)
# -------------------------------
# Sample proportion and sample size for each survey year.
p_2020, n_2020 = 0.483, 1897
p_2017, n_2017 = 0.416, 1925

# Difference of proportions and its standard error (sum of the two binomial variances).
diff = p_2020 - p_2017
variance_2020 = p_2020*(1-p_2020)/n_2020
variance_2017 = p_2017*(1-p_2017)/n_2017
se_diff = np.sqrt(variance_2020 + variance_2017)

# Normal-approximation interval using the 1.96 critical value.
ci_lower = diff - 1.96 * se_diff
ci_upper = diff + 1.96 * se_diff

# -------------------------------
# 3. Binomial hit distribution (Poké Ball)
# -------------------------------
n_trials, p_hit = 5, 0.159
x_vals = np.arange(0, n_trials + 1)               # 0..n_trials hits
hit_probs = binom.pmf(x_vals, n_trials, p_hit)    # probability of each count

fig, ax = plt.subplots()
ax.bar(x_vals, hit_probs, color='skyblue')
ax.set_title("Poké Ball Hit Distribution (5 Throws)")
ax.set_xlabel("Number of Hits")
ax.set_ylabel("Probability")
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# 4. Linear regression (Temperature → Pokémon appearance)
# -------------------------------
# Synthetic observations: a straight line plus unit-variance Gaussian noise.
temps = np.array([5, 6, 7, 8, 9])
noise = np.random.normal(0, 1, size=len(temps))
days = 20 - 2.5 * temps + noise

# Degree-1 least-squares fit, then the fitted value at 6.1 deg C.
coef = np.polyfit(temps, days, 1)
pred_day = np.polyval(coef, 6.1)
fitted_days = np.polyval(coef, temps)

fig, ax = plt.subplots()
ax.scatter(temps, days, color='orange', label='Observed')
ax.plot(temps, fitted_days, color='red', label='Regression Line')
ax.scatter([6.1], [pred_day], color='green', label='Prediction (6.1°C)')
ax.set_title("Temperature vs. Pokémon Appearance Day")
ax.set_xlabel("Temperature (°C)")
ax.set_ylabel("Appearance Day (Days After Mar 31)")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# Summary Table
# -------------------------------
# Rounded headline figures from the F-test, interval and regression sections.
summary = {
    "Mean (Region A)": round(mean_A, 2),
    "Variance (Region A)": round(var_A, 2),
    "Mean (Region B)": round(mean_B, 2),
    "Variance (Region B)": round(var_B, 2),
    "F-statistic (A/B)": round(F_statistic, 3),
    "CI of Interest Difference": f"[{ci_lower:.3f}, {ci_upper:.3f}]",
    "Predicted Appearance Day at 6.1°C": round(pred_day, 2)
}

# Display as DataFrame
df_summary = pd.DataFrame(summary.items(), columns=["Metric", "Value"])
print("▼ Pokémon Themed Statistical Summary")
try:
    # `display` only exists as a builtin inside IPython/Jupyter sessions.
    display(df_summary)
except NameError:
    # Plain Python interpreter: fall back to a text table instead of crashing.
    print(df_summary.to_string(index=False))

# --- Import necessary libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2, chisquare, f_oneway

# --- 1. Goodness-of-fit test: Pokémon lottery results ---
# Categories: 1st prize, 2nd prize, fail
observed = np.array([5, 12, 33])
expected = np.array([10, 15, 25])

# Pearson chi-squared statistic and its p-value. `chisquare` also checks that
# the observed and expected totals agree before computing the statistic.
chi_stat, chi_pval = chisquare(f_obs=observed, f_exp=expected)
df_chi = len(observed) - 1
critical_value = chi2.ppf(0.95, df=df_chi)  # 5% significance level

print("[Goodness-of-fit Test]")
print(f"Chi-squared statistic: {chi_stat:.2f}")
print(f"Critical value (df={df_chi}): {critical_value:.2f}")
print(f"P-value: {chi_pval:.4f}")
if chi_stat > critical_value:
    print("→ Reject H₀: Observed Pokémon lottery results significantly differ from expected.")
else:
    print("→ Fail to reject H₀: No significant difference in Pokémon lottery results.")
# Blank separator line regardless of the outcome (previously only on one branch).
print()

# --- 2. ANOVA: CP across Pokémon regions ---
np.random.seed(42)  # reproducibility

# (region, mean CP, standard deviation, sample size). The samples are drawn in
# this order so the seeded random stream yields the same values as before.
region_specs = [
    ('Kanto', 210, 20, 27),
    ('Johto', 89, 15, 13),
    ('Hoenn', 712, 30, 3),
    ('Sinnoh', 515, 25, 31),
    ('Unova', 192, 10, 7),
    ('Kalos', 559, 40, 3),
]
regions = {
    name: np.random.normal(mean_cp, sd_cp, size)
    for name, mean_cp, sd_cp, size in region_specs
}

# One-way ANOVA across all regional samples.
anova_result = f_oneway(*regions.values())
print("[ANOVA]")
print(f"F-statistic: {anova_result.statistic:.2f}")
print(f"P-value: {anova_result.pvalue:.4e}")
if anova_result.pvalue < 0.05:
    print("→ Reject H₀: There are significant differences in CP among regions.")
else:
    print("→ Fail to reject H₀: No significant CP difference among regions.")
# Blank separator line regardless of the outcome (previously only on one branch).
print()

# --- 3. Visualization: overlaid CP histograms, one per region ---
fig, ax = plt.subplots(figsize=(10, 6))
for name, cp in regions.items():
    ax.hist(cp, bins=10, alpha=0.6, label=name)
ax.set_title("Combat Power (CP) Distribution by Pokémon Region")
ax.set_xlabel("CP")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

# --- Import libraries ---
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm, binomtest, ttest_ind

# Seed the global NumPy generator used by the simulated sections below.
np.random.seed(42)

# -------------------------------
# 1. Confidence Interval (Proportion)
# -------------------------------
p_hat = 0.483  # sample proportion
n = 1897       # sample size
se = np.sqrt(p_hat * (1 - p_hat) / n)

# Normal-approximation 95% interval (critical value 1.96).
ci_lower = p_hat - 1.96 * se
ci_upper = p_hat + 1.96 * se

# Plot: the interval as a horizontal bar with the point estimate on top.
fig, ax = plt.subplots(figsize=(6, 1.5))
ax.hlines(1, ci_lower, ci_upper, colors='blue', linewidth=6, label='95% CI')
ax.plot(p_hat, 1, 'ro', label='Sample Proportion')
ax.set_title('League Interest Proportion (95% CI)')
ax.set_yticks([])  # the y position carries no meaning
ax.set_xlabel("Proportion")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# 2. Regression Analysis: Temperature vs. Appearance Day
# -------------------------------
# Synthetic observations: a straight line plus unit-variance Gaussian noise.
temps = np.array([5, 6, 7, 8, 9])
gaussian_noise = np.random.normal(0, 1, len(temps))
days = 20 - 2.5 * temps + gaussian_noise

# Degree-1 least-squares fit; polyfit returns the highest-order coefficient first.
line_coefficients = np.polyfit(temps, days, 1)
slope, intercept = line_coefficients
pred_day = slope * 6.1 + intercept

# Plot: Regression
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(temps, days, color='orange', label='Observed')
ax.plot(temps, slope * temps + intercept, color='red', label='Regression Line')
ax.scatter([6.1], [pred_day], color='green', label='Prediction (6.1°C)')
ax.set_title("Pokémon Appearance Day vs. Temperature")
ax.set_xlabel("Temperature (°C)")
ax.set_ylabel("Days After Mar 31")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# 3. Binomial Test: Pikachu's Thunderbolt Accuracy
# -------------------------------
successes, trials = 18, 30
null_p = 0.5  # success probability under H0

# Exact two-sided binomial test against the null probability.
binom_result = binomtest(k=successes, n=trials, p=null_p, alternative='two-sided')
p_value_binom = binom_result.pvalue

# Plot: normal curve with the null mean and standard deviation, evaluated at
# each possible success count, as an approximation of the null distribution.
x_vals = np.arange(0, trials + 1)
null_mean = trials * null_p
null_sd = np.sqrt(trials * null_p * (1 - null_p))
approx_probs = norm.pdf(x_vals, loc=null_mean, scale=null_sd)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(x_vals, approx_probs, color='skyblue', label='Expected under H₀')
ax.axvline(successes, color='red', linestyle='--', label='Observed Success')
ax.set_title("Pikachu's Thunderbolt Success (Binomial Test)")
ax.set_xlabel("Number of Successes")
ax.set_ylabel("Probability (Normal Approx.)")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# 4. t-Test: Charmander vs. Squirtle Speed
# -------------------------------
# Two simulated samples of 30 speeds each (Charmander is drawn first).
speeds_charmander = np.random.normal(65, 10, 30)
speeds_squirtle = np.random.normal(58, 10, 30)

# Independent two-sample t-test with SciPy's default settings.
ttest_outcome = ttest_ind(speeds_charmander, speeds_squirtle)
t_stat, t_pval = ttest_outcome

# Plot: side-by-side boxplots of the two samples.
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# kept unchanged here so older versions keep working.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot([speeds_charmander, speeds_squirtle], labels=['Charmander', 'Squirtle'])
ax.set_title("Speed Comparison: Charmander vs. Squirtle (t-test)")
ax.set_ylabel("Speed")
ax.grid(True)
plt.tight_layout()
plt.show()

# -------------------------------
# Summary Table
# -------------------------------
# Headline results from the interval, regression, binomial-test and t-test sections.
summary = {
    "95% CI for League Interest": f"[{ci_lower:.3f}, {ci_upper:.3f}]",
    "Predicted Day (6.1°C)": round(pred_day, 2),
    "Binomial Test p-value (Thunderbolt)": round(p_value_binom, 4),
    "T-test Statistic (Speed)": round(t_stat, 2),
    "T-test p-value (Speed)": round(t_pval, 4)
}
df_summary = pd.DataFrame(summary.items(), columns=["Analysis", "Result"])
print("▼ Pokémon Statistical Summary")
try:
    # `display` only exists as a builtin inside IPython/Jupyter sessions.
    display(df_summary)
except NameError:
    # Plain Python interpreter: fall back to a text table instead of crashing.
    print(df_summary.to_string(index=False))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Dummy data: Pokémon items and regions (instead of prefectures) ---
# Nine regions, each with a simulated retention rate for five devices/items.
regions = ['Kanto', 'Johto', 'Hoenn', 'Sinnoh', 'Unova', 'Kalos', 'Alola', 'Galar', 'Paldea']
np.random.seed(42)

# (column, mean %, standard deviation) for Pokégear (PG), Pokétch (PT),
# PokéNav (PN), TV and PokéDisk (PD). The columns are drawn in this order so
# the seeded random stream yields the same values as before.
retention_specs = [
    ('Pokégear (PG)', 60, 10),   # like MP
    ('Pokétch (PT)', 80, 5),     # like SP
    ('PokéNav (PN)', 50, 8),     # like PC
    ('TV', 95, 2),               # TV: high retention
    ('PokéDisk (PD)', 70, 6),    # DVD/BD equivalent
]
data = {'Region': regions}
for column, mean_pct, std_pct in retention_specs:
    data[column] = np.random.normal(mean_pct, std_pct, len(regions))

df = pd.DataFrame(data)

# --- Boxplot of every item's retention rate (one box per item column) ---
fig, ax = plt.subplots(figsize=(10, 6))
df.drop(columns='Region').boxplot(ax=ax)
ax.set_title('Pokémon Item Retention Rate by Region')
ax.set_ylabel('Retention Rate (%)')
ax.grid(True)
plt.tight_layout()
plt.show()

# --- Side-by-side histograms: Pokégear (PG) vs Pokétch (PT), shared y axis ---
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
histogram_panels = [('Pokégear (PG)', 'skyblue'), ('Pokétch (PT)', 'salmon')]
for panel, (item_column, bar_color) in zip(axes, histogram_panels):
    panel.hist(df[item_column], bins=5, color=bar_color, edgecolor='black')
    panel.set_title(f"{item_column} Retention Rate")
    panel.set_xlabel("Retention (%)")
    panel.grid(True)
# Only the left panel carries the y label.
axes[0].set_ylabel("Frequency")

plt.tight_layout()
plt.show()
from scipy.stats import ttest_ind, pearsonr, linregress

# --- t-test: Pokétch (PT) vs Pokégear (PG) ---
pt_rates = df['Pokétch (PT)']
pg_rates = df['Pokégear (PG)']
t_stat, t_pval = ttest_ind(pt_rates, pg_rates)

# --- Correlation: PokéNav vs PokéDisk ---
pn_rates = df['PokéNav (PN)']
pd_rates = df['PokéDisk (PD)']
corr_coef, corr_pval = pearsonr(pn_rates, pd_rates)

# --- Simple regression: PokéNav as the predictor, PokéDisk as the response ---
slope, intercept, r_value, p_value, std_err = linregress(pn_rates, pd_rates)
predicted_pd = intercept + slope * pn_rates

# --- Scatter of the data with the fitted line ---
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(pn_rates, pd_rates, label='Data', color='purple')
ax.plot(pn_rates, predicted_pd, color='black', linestyle='--', label='Regression Line')
ax.set_xlabel("PokéNav Retention (%)")
ax.set_ylabel("PokéDisk Retention (%)")
ax.set_title("Regression: PokéNav vs PokéDisk")
ax.grid(True)
ax.legend()
plt.tight_layout()
plt.show()

# --- Output of the results ---
# One formatted line per analysis carried out above.
results = {
    't-test (PT vs PG)': f"t = {t_stat:.3f}, p = {t_pval:.4f}",
    'Correlation (PN vs PD)': f"r = {corr_coef:.3f}, p = {corr_pval:.4f}",
    'Regression (PN → PD)': f"y = {intercept:.2f} + {slope:.2f}x, R² = {r_value**2:.3f}"
}

results_df = pd.DataFrame(results.items(), columns=['Analysis', 'Result'])
# A bare `results_df` expression is only rendered when it ends a notebook cell;
# show the table explicitly so it also appears when run as a script.
try:
    display(results_df)
except NameError:
    print(results_df.to_string(index=False))

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- Data preparation (drop the Region label column) ---
X = df.drop(columns=['Region'])

# --- Multivariate linear regression ---
# PokéDisk (PD) is the response; every other item column is a predictor.
X_reg = X.drop(columns='PokéDisk (PD)')
y_reg = X['PokéDisk (PD)']

reg_model = LinearRegression()
reg_model.fit(X_reg, y_reg)
reg_coeffs = reg_model.coef_
reg_intercept = reg_model.intercept_
reg_r2 = reg_model.score(X_reg, y_reg)  # R² on the training data itself

# --- Principal component analysis (PCA) on standardised features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
explained_variance = pca.explained_variance_ratio_

# --- Clustering (KMeans) ---
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise change the clusters from one version to
# the next (and emit a FutureWarning on some versions).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# --- PCA scatter plot coloured by cluster, each point annotated with its region ---
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', s=100)
for i, txt in enumerate(df['Region']):
    # Offset the label slightly so it does not sit on top of the marker.
    plt.annotate(txt, (X_pca[i, 0] + 0.1, X_pca[i, 1] + 0.1), fontsize=8)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Pokémon Item Retention with Clustering')
plt.grid(True)
plt.colorbar(scatter, label='Cluster')
plt.tight_layout()
plt.show()

# --- Output of the results ---
results_multi = {
    'Multivariate Regression R²': reg_r2,
    'Regression Coefficients': dict(zip(X_reg.columns, reg_coeffs)),
    'Regression Intercept': reg_intercept,
    'PCA Explained Variance': explained_variance.tolist()
}

# One row per predictor coefficient, followed by the intercept and the R² score.
results_df = pd.DataFrame.from_dict(results_multi['Regression Coefficients'], orient='index', columns=['Coefficient'])
results_df.loc['(Intercept)'] = reg_intercept
results_df.loc['R²'] = reg_r2
# A bare `results_df` expression is only rendered when it ends a notebook cell;
# show the table explicitly so it also appears when run as a script.
try:
    display(results_df)
except NameError:
    print(results_df)

# --- Import necessary libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# --- Dummy data for Pokémon analysis: capture count vs market price ---
np.random.seed(42)
capture = np.array([100, 110, 120, 130, 140, 150])
price = np.array([16.5, 15.8, 15.2, 14.1, 13.4, 12.9])

# --- Collect both series in one DataFrame ---
df = pd.DataFrame({'CaptureCount': capture, 'MarketPrice': price})

# --- Regressors (with a constant column for the intercept) and response ---
y = df['MarketPrice']
X = sm.add_constant(df['CaptureCount'])

# --- Ordinary least squares fit ---
ols_spec = sm.OLS(y, X)
model = ols_spec.fit()

# --- Fitted prices, used to draw the regression line ---
df['PredictedPrice'] = model.predict(X)

# --- Plot the observations together with the fitted regression line ---
plt.figure(figsize=(8, 5))
plt.scatter(df['CaptureCount'], df['MarketPrice'], label='Actual Data', color='blue')
plt.plot(df['CaptureCount'], df['PredictedPrice'], label='Regression Line', color='red')
plt.xlabel('Number of Captures')
plt.ylabel('Market Price (Pokémon Coins)')
plt.title('Pokémon Regression: Capture Count vs Market Price')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Show the regression summary ---
# A bare `model.summary()` expression is only rendered at the end of a notebook
# cell; print it so the table also appears when run as a script.
print(model.summary())

# --- 必要なライブラリ ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# --- Generate dummy Pokémon data ---
np.random.seed(42)

n = 50
# (column, mean, standard deviation). The columns are drawn in this order so
# the seeded random stream yields the same values as before.
stat_specs = [
    ('Attack', 75, 15),
    ('Defense', 70, 20),
    ('Speed', 80, 10),
    ('Popularity', 50, 15),   # popularity score
    ('Price', 1000, 200),     # price (provisional)
]
columns = {'Name': [f'Pokemon_{i}' for i in range(n)]}
for stat_name, stat_mean, stat_sd in stat_specs:
    columns[stat_name] = np.random.normal(stat_mean, stat_sd, n)
df = pd.DataFrame(columns)

# --- 1. Correlation matrix & pairwise scatter plots ---
numeric_columns = ['Attack', 'Defense', 'Speed', 'Popularity', 'Price']
sns.pairplot(df[numeric_columns])
plt.suptitle("Pokémon Attribute Pair Plots", y=1.02)
plt.show()

corr = df[numeric_columns].corr()
print("Correlation Matrix:\n", corr)

# --- 2. T-test: popularity of fast vs. slow Pokémon ---
# Split at the median speed; rows exactly at the median count as slow.
median_speed = df['Speed'].median()
is_fast = df['Speed'] > median_speed
fast = df.loc[is_fast, 'Popularity']
slow = df.loc[~is_fast, 'Popularity']

# equal_var=False selects Welch's test (no equal-variance assumption).
t_stat, p_value = stats.ttest_ind(fast, slow, equal_var=False)
print(f"\nT-test result (Popularity fast vs slow): t = {t_stat:.3f}, p = {p_value:.3f}")

# --- 3. Simple regression (popularity → price) ---
X = df[['Popularity']]
y = df['Price']
reg = LinearRegression()
reg.fit(X, y)

# Print the fitted regression equation.
print(f"\nRegression: Price = {reg.intercept_:.2f} + {reg.coef_[0]:.2f} × Popularity")

# Visualise the data with the fitted line.
fitted_price = reg.predict(X)
plt.scatter(df['Popularity'], df['Price'], label="Data")
plt.plot(df['Popularity'], fitted_price, color='red', label="Regression Line")
plt.xlabel("Popularity")
plt.ylabel("Price")
plt.title("Simple Linear Regression: Price ~ Popularity")
plt.legend()
plt.show()

# --- 4. Multiple regression (Attack, Defense, Speed) → Price ---
predictor_names = ['Attack', 'Defense', 'Speed']
X_multi = df[predictor_names]
y = df['Price']
reg_multi = LinearRegression()
reg_multi.fit(X_multi, y)
print("\nMultiple Regression Coefficients:")
for feature, coef in zip(X_multi.columns, reg_multi.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept: {reg_multi.intercept_:.2f}")

# --- 5. KMeans clustering (2 clusters) on the raw battle stats ---
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise change the clusters from one version to
# the next (and emit a FutureWarning on some versions).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
df['Cluster'] = kmeans.fit_predict(df[['Attack', 'Defense', 'Speed']])

# Visualise the clusters in the Attack/Defense plane.
sns.scatterplot(data=df, x='Attack', y='Defense', hue='Cluster', palette='Set1')
plt.title("Pokémon Clustering (Attack vs Defense)")
plt.show()

# --- 6. Principal Component Analysis (PCA) of the same three stats ---
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df[['Attack', 'Defense', 'Speed']])
df['PC1'], df['PC2'] = X_pca[:, 0], X_pca[:, 1]

# Visualise the first two components, coloured by cluster.
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set2')
plt.title("PCA of Pokémon Stats")
plt.show()

# --- Import ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# --- Dummy data related to Pokémon battle strength (provisional) ---
np.random.seed(42)
n = 100
# Columns are drawn in this order so the seeded stream matches: each entry is
# (column, mean, standard deviation).
battle_specs = [
    ('Attack', 70, 15),    # attack
    ('Defense', 60, 10),   # defense
    ('Speed', 65, 12),     # speed
    ('WinRate', 50, 10),   # win rate (%)
]
battle_columns = {}
for stat_name, stat_mean, stat_sd in battle_specs:
    battle_columns[stat_name] = np.random.normal(stat_mean, stat_sd, n)
df = pd.DataFrame(battle_columns)

# --- Simple regression: effect of Attack on win rate ---
attack, win_rate = df['Attack'], df['WinRate']
slope, intercept, r_value, p_value, std_err = stats.linregress(attack, win_rate)
print("Regression Coefficients")
print(f"WinRate = {intercept:.2f} + {slope:.2f} * Attack")
print(f"R-squared: {r_value**2:.3f}, p-value: {p_value:.4f}")

# --- Scatter plot with seaborn's fitted regression line ---
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x='Attack', y='WinRate', data=df, line_kws={"color": "red"}, ax=ax)
ax.set_title('Attack vs Win Rate (Linear Regression)')
ax.set_xlabel('Attack')
ax.set_ylabel('Win Rate (%)')
ax.grid(True)
plt.show()

# --- One-sample t-test: is the mean Speed equal to 65? (H0: mu = 65) ---
t_stat, p_val = stats.ttest_1samp(df['Speed'], popmean=65)
print("\nT-test for Speed mean = 65")
print(f"t-statistic = {t_stat:.3f}, p-value = {p_val:.4f}")

# --- Pearson correlation between Defense and win rate ---
corr, p_corr = stats.pearsonr(df['Defense'], win_rate)
print("\nCorrelation between Defense and WinRate")
print(f"Pearson correlation = {corr:.3f}, p-value = {p_corr:.4f}")

# --- Scatter matrix of all columns ---
sns.pairplot(df)
plt.suptitle("Scatter Matrix for Pokémon Stats", y=1.02)
plt.show()

# --- Import required libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Dummy data: Pokémon stats and sales score ---
stat_columns = ['Name', 'Attack', 'Defense', 'Speed', 'SalesScore']
pokemon_rows = [
    # (name, attack, defense, speed, sales score)
    ('Pikachu', 55, 40, 90, 300),
    ('Charizard', 84, 78, 100, 500),
    ('Blastoise', 83, 100, 78, 400),
    ('Venusaur', 82, 83, 80, 420),
    ('Gengar', 65, 60, 110, 380),
    ('Dragonite', 134, 95, 80, 600),
    ('Snorlax', 110, 65, 30, 450),
    ('Alakazam', 50, 45, 120, 390),
    ('Machamp', 130, 80, 55, 410),
    ('Gyarados', 125, 79, 81, 580),
]
# Transpose the per-Pokémon rows into the column-oriented dict used for the DataFrame.
data = {column: list(values) for column, values in zip(stat_columns, zip(*pokemon_rows))}

df = pd.DataFrame(data)

# --- Standardise the three stat columns for clustering and PCA ---
features = ['Attack', 'Defense', 'Speed']
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Multivariate linear regression of the sales score on the raw stats ---
sales_score = df['SalesScore']
reg_model = LinearRegression()
reg_model.fit(X, sales_score)

# --- Report coefficients, intercept and the R² on the training data ---
print("Regression Coefficients:")
for feature, coef in zip(features, reg_model.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept: {reg_model.intercept_:.2f}")
print(f"R^2 Score: {reg_model.score(X, df['SalesScore']):.3f}")

# --- KMeans clustering on the standardised stats ---
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise change the clusters from one version to
# the next (and emit a FutureWarning on some versions).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)
df['Cluster'] = kmeans.labels_

# --- PCA down to two components for visualisation ---
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
df[['PC1', 'PC2']] = principal_components

# --- Plot the PCA result: colour encodes the cluster, marker style the Pokémon ---
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', style='Name', palette='viridis', s=100)
plt.title("PCA of Pokémon Stats with Clusters")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

# --- ライブラリのインポート ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, binomtest, pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- Dummy data: Pokémon item prices and per-region sales ---
np.random.seed(42)
item_names = ["モンスターボール", "スーパーボール", "ハイパーボール", "キズぐすり", "すごいキズぐすり", "わざマシン"]
item_prices = [200, 600, 1200, 300, 700, 4000]

# Random sales counts in [100, 300) per item; region A is drawn before region B.
sales_region_a = np.random.randint(100, 300, len(item_names))
sales_region_b = np.random.randint(100, 300, len(item_names))

data = {
    "Item": item_names,
    "Price": item_prices,
    "Sales_Region_A": sales_region_a,
    "Sales_Region_B": sales_region_b,
}
df = pd.DataFrame(data)

# --- Two-sample t-test on the difference in sales between regions A and B ---
# (The section was titled "confidence interval", but only the test is run.)
sales_a = df["Sales_Region_A"]
sales_b = df["Sales_Region_B"]
stat, pval = ttest_ind(sales_a, sales_b)
print(f"t検定: p値 = {pval:.4f}")

# --- Correlation coefficient and scatter plot with a fitted line ---
corr, _ = pearsonr(df["Price"], sales_a)
print(f"Price と Sales_Region_A の相関係数: {corr:.2f}")

sns.regplot(x="Price", y="Sales_Region_A", data=df)
plt.title("Price vs Sales (Region A)")
plt.show()

# --- Simple regression: price → region A sales ---
X = df[["Price"]]
y = df["Sales_Region_A"]
model = LinearRegression()
model.fit(X, y)
print(f"回帰係数: {model.coef_[0]:.2f}, 切片: {model.intercept_:.2f}")

# --- Multiple regression: price and region B sales → region A sales ---
X_multi = df[["Price", "Sales_Region_B"]]
model_multi = LinearRegression()
model_multi.fit(X_multi, y)
print("多変量回帰係数:", model_multi.coef_)

# --- Clustering on price and both regional sales columns ---
# NOTE(review): the features are used unscaled here, unlike the sections that
# standardise before KMeans/PCA — confirm that this is intended.
cluster_columns = ["Price", "Sales_Region_A", "Sales_Region_B"]
X_cluster = df[cluster_columns]
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(X_cluster)
df["Cluster"] = kmeans.labels_

# --- PCA projection to two components, coloured by cluster ---
pca = PCA(n_components=2)
components = pca.fit_transform(X_cluster)
df_pca = pd.DataFrame(components, columns=["PC1", "PC2"])
df_pca["Cluster"] = df["Cluster"]
sns.scatterplot(x="PC1", y="PC2", hue="Cluster", data=df_pca)
plt.title("PCA of Pokémon Item Stats")
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do by signing up