Statistical Regression Mathematics for World History and Japanese History (Diary)

# Program Name: regression_historical_data_demo.py
# 概要 / Purpose:
#   日本史と世界史に関するデータを用いて、各回帰手法を体験するデモプログラム。
#   Demonstration script using Japanese and world historical data to explore various regression methods.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats  # for statistical tests / 統計検定用
from sklearn.linear_model import LinearRegression  # for linear regression / 線形回帰モデル
from sklearn.model_selection import cross_val_score  # for cross-validation / 交差検証
import sympy as sp  # for symbolic math / シンボリック数学

# --- 1. Data Generation & Scatter Plot ---
# 日本の人口推移データ / Japanese population over time
# Year and population (in millions)
years_jp = np.array([1872, 1900, 1920, 1950, 2000, 2020])  # 国勢調査年 / Census years
pop_jp = np.array([33.3, 43.9, 55.9, 83.2, 126.9, 125.8])     # 人口(百万) / Population (millions)

# Correlation coefficient calculation / 相関係数の計算
corr_jp = np.corrcoef(years_jp, pop_jp)[0,1]
print(f"Correlation (Year vs Japan Pop): {corr_jp:.3f}")  # 出力: 相関係数 / correlation

# Scatter plot / 散布図の描画
plt.scatter(years_jp, pop_jp)
plt.title("Japanese Population over Time")  # English only (plot title)
plt.xlabel("Year")                                # X-axis label
plt.ylabel("Population (millions)")               # Y-axis label
plt.show()
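# Optional cross-check (sketch, using scipy.stats.pearsonr): the same correlation
# together with a p-value for its significance.
r_jp, p_jp = stats.pearsonr(years_jp, pop_jp)
print(f"Pearson r: {r_jp:.3f}, p-value: {p_jp:.3g}")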

# --- 2. Least Squares / 最小二乗法 ---
# Compute mean values / 平均値の計算
mean_year = years_jp.mean()
mean_pop = pop_jp.mean()
# Calculate slope (m) and intercept (c) / 勾配と切片の計算
m_ls = np.sum((years_jp - mean_year) * (pop_jp - mean_pop)) / np.sum((years_jp - mean_year)**2)
c_ls = mean_pop - m_ls * mean_year
print(f"OLS slope m: {m_ls:.6f}, intercept c: {c_ls:.3f}")  # 出力: 勾配と切片

# Predicted values and plot / 予測値とプロット
pop_pred_ls = m_ls * years_jp + c_ls
plt.scatter(years_jp, pop_jp)
plt.plot(years_jp, pop_pred_ls, label='OLS Fit')
plt.title("OLS Fit: Year vs Population")
plt.xlabel("Year")
plt.ylabel("Population (millions)")
plt.legend()
plt.show()
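# Cross-check (sketch, NumPy only): np.polyfit with degree 1 performs the same
# least-squares fit and should reproduce m_ls and c_ls above.
m_np, c_np = np.polyfit(years_jp, pop_jp, 1)
print(f"np.polyfit slope: {m_np:.6f}, intercept: {c_np:.3f}")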

# --- 3. R^2 Evaluation / 決定係数の評価 ---
ss_res = np.sum((pop_jp - pop_pred_ls)**2)  # 残差平方和 / sum of squared residuals
ss_tot = np.sum((pop_jp - mean_pop)**2)    # 全変動 / total sum of squares
r2 = 1 - ss_res / ss_tot
print(f"R^2: {r2:.3f}")  # 出力: 決定係数

# --- 4. Polynomial Regression / 多項式回帰 (世界人口推移) ---
# World population data / 世界人口(十億単位)
years_world = np.array([1800, 1900, 1950, 2000, 2020])  # 年
pop_world = np.array([1.0, 1.6, 2.5, 6.1, 7.8])         # 人口(十億)/ population (billions)
# Design matrix for quadratic fit / 2次多項式用デザイン行列
design = np.vstack([np.ones_like(years_world), years_world, years_world**2]).T
theta_poly = np.linalg.pinv(design.T @ design) @ design.T @ pop_world
print(f"Poly coeffs (world pop): {theta_poly}")  # 出力: 多項式係数

# Plot polynomial regression / 多項式回帰のプロット
year_range = np.linspace(1800, 2020, 200)
design_new = np.vstack([np.ones_like(year_range), year_range, year_range**2]).T
pop_poly = design_new @ theta_poly
plt.scatter(years_world, pop_world)
plt.plot(year_range, pop_poly, label='Poly deg 2')
plt.title("Polynomial Regression on World Population")
plt.xlabel("Year")
plt.ylabel("Population (billions)")
plt.legend()
plt.show()
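# Cross-check (sketch): np.polyfit returns the same quadratic coefficients,
# just in descending order (x^2 term first).
coeffs_np = np.polyfit(years_world, pop_world, 2)
print(f"np.polyfit coeffs (descending): {coeffs_np}")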

# --- 5. Multiple Regression / 多変量回帰 (Year, Pop → GDP of Japan) ---
# GDP data for Japan (in trillion USD; rough illustrative values) / 日本のGDP(兆USD、概算の例示値)
gdp_jp = np.array([0.002, 0.013, 0.034, 4.9, 5.1, 5.0])
# Construct feature matrix / 特徴量行列の作成
X_multi = np.vstack([np.ones_like(years_jp), years_jp, pop_jp]).T
theta_multi = np.linalg.pinv(X_multi.T @ X_multi) @ X_multi.T @ gdp_jp
print(f"Multi-regression coeffs (GDP): {theta_multi}")  # 出力: 回帰係数

# --- 6. Gradient Descent / 勾配降下法 (Demo on Japan Pop) ---
# 年を中心化して勾配のスケールを揃える / Center the year so the gradients stay well scaled;
# with raw years (~1900) the update diverges at any practical learning rate.
x_c = years_jp - mean_year
m_gd, c_gd = 0.0, 0.0
learning_rate = 1e-4  # 学習率 / step size
for _ in range(50000):
    pred = m_gd * x_c + c_gd
    dm = -(2 / len(x_c)) * np.sum(x_c * (pop_jp - pred))
    dc = -(2 / len(x_c)) * np.sum(pop_jp - pred)
    m_gd -= learning_rate * dm
    c_gd -= learning_rate * dc
c_gd_orig = c_gd - m_gd * mean_year  # 切片を元の年スケールへ戻す / intercept on the original year scale
print(f"GD result m: {m_gd:.6f}, c: {c_gd_orig:.3f}")  # 出力: 勾配降下法結果

# --- 7. Newton–Raphson Reciprocal / ニュートン法逆数近似 ---
g = 0.001  # rough initial guess for the reciprocal of 645 (World War I casualties, thousands) / 645(第一次世界大戦犠牲者数、千人)の逆数の初期推定値
for _ in range(5):
    g = g * (2 - 645 * g)
print(f"Reciprocal approx of 645: {g:.6e}")  # 出力: 逆数近似

# --- 8. Sympy Derivative & Taylor / 微分 & テイラー ---
x_s, a_s = sp.symbols('x_s a_s')  # symbols definition / シンボル定義
expr = a_s * x_s  # Example: P = a * x / 例式
deriv = sp.diff(expr, x_s)  # derivative / 微分
taylor3 = sp.series(sp.sin(x_s), x_s, 0, 5)  # Taylor series for sin(x) up to x^4
print(f"Symbolic derivative of a*x: {deriv}")  # 出力: 微分結果
print(f"Taylor series of sin(x): {taylor3}")  # 出力: テイラー展開

# --- 9. Trigonometric Regression / 三角関数回帰 (周期的歴史イベント) ---
# Synthetic periodic event data (e.g., solar cycles) / 太陽活動周期データの合成
np.random.seed(0)
x_trig = np.linspace(0, 11, 100)  # years within one solar cycle (~11 years)
noise_trig = np.random.randn(100) * 0.2
# Simulate cycle amplitude
y_trig = 50 + 40 * np.sin(2 * np.pi * x_trig / 11) + noise_trig
# Feature transform with an intercept / 切片付き特徴量変換
# (the data oscillates around ~50, so a constant column is needed for the fit level)
design_trig = np.vstack([np.ones_like(x_trig),
                         np.sin(2 * np.pi * x_trig / 11),
                         np.cos(2 * np.pi * x_trig / 11)]).T
theta_trig = np.linalg.pinv(design_trig.T @ design_trig) @ design_trig.T @ y_trig
C_trig, A_trig, B_trig = theta_trig
print(f"Trig regression coeffs C: {C_trig:.3f}, A: {A_trig:.3f}, B: {B_trig:.3f}")
# Plot trig regression / プロット
plt.scatter(x_trig, y_trig, label='Data')
plt.plot(x_trig,
         C_trig + A_trig * np.sin(2 * np.pi * x_trig / 11)
         + B_trig * np.cos(2 * np.pi * x_trig / 11),
         label='Trig Fit')
plt.title("Trigonometric Regression on Solar Cycle")
plt.xlabel("Year in Cycle")
plt.ylabel("Activity Level")
plt.legend()
plt.show()
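# Derived quantities (sketch): amplitude and phase recovered from the sin/cos
# coefficients; the amplitude should land near the 40 used in the simulation.
amp_trig = np.sqrt(A_trig**2 + B_trig**2)
phase_trig = np.arctan2(B_trig, A_trig)
print(f"Estimated amplitude: {amp_trig:.3f}, phase: {phase_trig:.3f} rad")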

# --- 10. Exponential Regression / 指数関数回帰 (世界GDP成長) ---
# Synthetic world GDP growth data / 世界GDP成長データの合成
np.random.seed(1)
x_exp = np.array([1960, 1970, 1980, 1990, 2000, 2010, 2020])
gdp_world = np.array([1.0, 2.0, 3.5, 6.0, 10.0, 18.0, 25.0]) + np.random.randn(7) * 1.0
# Fit exponential y = a * exp(b*x)
Y_log = np.log(gdp_world)
X_exp = np.vstack([np.ones_like(x_exp), x_exp]).T
beta_exp = np.linalg.pinv(X_exp.T @ X_exp) @ X_exp.T @ Y_log
a_exp = np.exp(beta_exp[0])
b_exp = beta_exp[1]
print(f"Exp regression a: {a_exp:.3f}, b: {b_exp:.6f}")
# Plot exponential fit
x_fit = np.linspace(1960, 2020, 100)
plt.scatter(x_exp, gdp_world, label='Data')
plt.plot(x_fit, a_exp * np.exp(b_exp * x_fit), label='Exp Fit')
plt.title("Exponential Regression on World GDP")
plt.xlabel("Year")
plt.ylabel("GDP (trillion USD)")
plt.legend()
plt.show()
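# Interpretation (sketch): the fitted b implies an average annual growth factor
# exp(b) and a doubling time of ln(2)/b years.
print(f"Implied annual growth: {(np.exp(b_exp) - 1) * 100:.2f}%, "
      f"doubling time: {np.log(2) / b_exp:.1f} years")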

# --- 11. Logarithmic Regression / 対数関数回帰 (識字率 vs GDP per capita) ---
# Synthetic literacy vs GDP per capita data / 識字率と一人当たりGDPのデータ合成
np.random.seed(2)
x_log = np.array([500, 1000, 2000, 5000, 10000, 20000])  # GDP per capita
literacy = 40 + 50 * np.log(x_log) / np.log(20000) + np.random.randn(6) * 2
# Fit logarithmic y = a + b * ln(x)
Phi_log = np.vstack([np.ones_like(x_log), np.log(x_log)]).T
beta_log = np.linalg.pinv(Phi_log.T @ Phi_log) @ Phi_log.T @ literacy
a_log, b_log = beta_log
print(f"Log regression a: {a_log:.3f}, b: {b_log:.3f}")
# Plot logarithmic fit
x_plot = np.linspace(500, 20000, 100)
plt.scatter(x_log, literacy, label='Data')
plt.plot(x_plot, a_log + b_log * np.log(x_plot), label='Log Fit')
plt.title("Logarithmic Regression: Literacy vs GDP per Capita")
plt.xlabel("GDP per Capita (USD)")
plt.ylabel("Literacy Rate (%)")
plt.legend()
plt.show()
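# Usage example (sketch): predicted literacy rate at a hypothetical GDP per
# capita of 15000 USD (illustrative value, not in the data).
print(f"Predicted literacy at $15000: {a_log + b_log * np.log(15000):.1f}%")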

# --- 12. t-Test for Coefficients / t検定 ---
# Refit the simple linear model from section 2 with scikit-learn / 第2節のOLSモデルをscikit-learnで再推定
t_model = LinearRegression().fit(years_jp.reshape(-1,1), pop_jp)
beta = t_model.coef_[0]
se = np.sqrt(ss_res / (len(years_jp) - 2) / np.sum((years_jp - mean_year)**2))  # 標準誤差

t_stat = beta / se
p_val = stats.t.sf(abs(t_stat), df=len(years_jp)-2) * 2
print(f"t-stat: {t_stat:.3f}, p-value: {p_val:.3f}")

# --- 13. Cross-Validation / 交差検証 ---
cvs = cross_val_score(LinearRegression(), years_jp.reshape(-1,1), pop_jp, cv=3)
print(f"Cross-validation R² scores: {cvs}")

# --- 14. Numeric Integration of Residual CDF / 残差CDF数値積分 ---
residuals = pop_jp - t_model.predict(years_jp.reshape(-1,1))
x_vals = np.sort(residuals)
cdf_vals = np.arange(1, len(residuals)+1) / len(residuals)
risk = np.trapz(cdf_vals, x_vals)
print(f"Integrated residual CDF (risk metric): {risk:.3f}")
