Statistical Regression Mathematics for World History and Japanese History (Diary)

# Program Name: regression_historical_data_demo.py
# 概要 / Purpose:
#   日本史と世界史に関するデータを用いて、各回帰手法を体験するデモプログラム。
#   Demonstration script using Japanese and world historical data to explore various regression methods.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats  # for statistical tests / 統計検定用
from sklearn.linear_model import LinearRegression  # for linear regression / 線形回帰モデル
from sklearn.model_selection import cross_val_score  # for cross-validation / 交差検証
import sympy as sp  # for symbolic math / シンボリック数学

# --- 1. Data Generation & Scatter Plot ---
# 日本の人口推移データ / Japanese population over time
# Year and population (in millions)
years_jp = np.array([1872, 1900, 1920, 1950, 2000, 2020])  # 国勢調査年 / Census years
pop_jp = np.array([33.3, 43.9, 55.9, 83.2, 126.9, 125.8])     # 人口(百万) / Population (millions)

# Correlation coefficient calculation / 相関係数の計算
corr_jp = np.corrcoef(years_jp, pop_jp)[0,1]
print(f"Correlation (Year vs Japan Pop): {corr_jp:.3f}")  # 出力: 相関係数 / correlation

# Scatter plot / 散布図の描画
plt.scatter(years_jp, pop_jp)
plt.title("Japanese Population over Time")  # English only (plot title)
plt.xlabel("Year")                                # X-axis label
plt.ylabel("Population (millions)")               # Y-axis label
plt.show()
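# Optional cross-check (sketch, using scipy.stats.pearsonr): the same correlation
# together with a p-value for its significance.
r_jp, p_jp = stats.pearsonr(years_jp, pop_jp)
print(f"Pearson r: {r_jp:.3f}, p-value: {p_jp:.3g}")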

# --- 2. Least Squares / 最小二乗法 ---
# Compute mean values / 平均値の計算
mean_year = years_jp.mean()
mean_pop = pop_jp.mean()
# Calculate slope (m) and intercept (c) / 勾配と切片の計算
m_ls = np.sum((years_jp - mean_year) * (pop_jp - mean_pop)) / np.sum((years_jp - mean_year)**2)
c_ls = mean_pop - m_ls * mean_year
print(f"OLS slope m: {m_ls:.6f}, intercept c: {c_ls:.3f}")  # 出力: 勾配と切片

# Predicted values and plot / 予測値とプロット
pop_pred_ls = m_ls * years_jp + c_ls
plt.scatter(years_jp, pop_jp)
plt.plot(years_jp, pop_pred_ls, label='OLS Fit')
plt.title("OLS Fit: Year vs Population")
plt.xlabel("Year")
plt.ylabel("Population (millions)")
plt.legend()
plt.show()
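# Cross-check (sketch, NumPy only): np.polyfit with degree 1 performs the same
# least-squares fit and should reproduce m_ls and c_ls above.
m_np, c_np = np.polyfit(years_jp, pop_jp, 1)
print(f"np.polyfit slope: {m_np:.6f}, intercept: {c_np:.3f}")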

# --- 3. R^2 Evaluation / 決定係数の評価 ---
ss_res = np.sum((pop_jp - pop_pred_ls)**2)  # 残差平方和 / sum of squared residuals
ss_tot = np.sum((pop_jp - mean_pop)**2)    # 全変動 / total sum of squares
r2 = 1 - ss_res / ss_tot
print(f"R^2: {r2:.3f}")  # 出力: 決定係数

# --- 4. Polynomial Regression / 多項式回帰 (世界人口推移) ---
# World population data / 世界人口(十億単位)
years_world = np.array([1800, 1900, 1950, 2000, 2020])  # 年
pop_world = np.array([1.0, 1.6, 2.5, 6.1, 7.8])         # 人口(十億)/ population (billions)
# Design matrix for quadratic fit / 2次多項式用デザイン行列
design = np.vstack([np.ones_like(years_world), years_world, years_world**2]).T
theta_poly = np.linalg.pinv(design.T @ design) @ design.T @ pop_world
print(f"Poly coeffs (world pop): {theta_poly}")  # 出力: 多項式係数

# Plot polynomial regression / 多項式回帰のプロット
year_range = np.linspace(1800, 2020, 200)
design_new = np.vstack([np.ones_like(year_range), year_range, year_range**2]).T
pop_poly = design_new @ theta_poly
plt.scatter(years_world, pop_world)
plt.plot(year_range, pop_poly, label='Poly deg 2')
plt.title("Polynomial Regression on World Population")
plt.xlabel("Year")
plt.ylabel("Population (billions)")
plt.legend()
plt.show()
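# Cross-check (sketch): np.polyfit returns the same quadratic coefficients,
# just in descending order (x^2 term first).
coeffs_np = np.polyfit(years_world, pop_world, 2)
print(f"np.polyfit coeffs (descending): {coeffs_np}")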

# --- 5. Multiple Regression / 多変量回帰 (Year, Pop → GDP of Japan) ---
# GDP data for Japan (in trillion USD; rough illustrative values) / 日本のGDP(兆USD、概算の例示値)
gdp_jp = np.array([0.002, 0.013, 0.034, 4.9, 5.1, 5.0])
# Construct feature matrix / 特徴量行列の作成
X_multi = np.vstack([np.ones_like(years_jp), years_jp, pop_jp]).T
theta_multi = np.linalg.pinv(X_multi.T @ X_multi) @ X_multi.T @ gdp_jp
print(f"Multi-regression coeffs (GDP): {theta_multi}")  # 出力: 回帰係数

# --- 6. Gradient Descent / 勾配降下法 (Demo on Japan Pop) ---
# 年を中心化して勾配のスケールを揃える / Center the year so the gradients stay well scaled;
# with raw years (~1900) the update diverges at any practical learning rate.
x_c = years_jp - mean_year
m_gd, c_gd = 0.0, 0.0
learning_rate = 1e-4  # 学習率 / step size
for _ in range(50000):
    pred = m_gd * x_c + c_gd
    dm = -(2 / len(x_c)) * np.sum(x_c * (pop_jp - pred))
    dc = -(2 / len(x_c)) * np.sum(pop_jp - pred)
    m_gd -= learning_rate * dm
    c_gd -= learning_rate * dc
c_gd_orig = c_gd - m_gd * mean_year  # 切片を元の年スケールへ戻す / intercept on the original year scale
print(f"GD result m: {m_gd:.6f}, c: {c_gd_orig:.3f}")  # 出力: 勾配降下法結果

# --- 7. Newton–Raphson Reciprocal / ニュートン法逆数近似 ---
g = 0.001  # rough initial guess for the reciprocal of 645 (World War I casualties, thousands) / 645(第一次世界大戦犠牲者数、千人)の逆数の初期推定値
for _ in range(5):
    g = g * (2 - 645 * g)
print(f"Reciprocal approx of 645: {g:.6e}")  # 出力: 逆数近似

# --- 8. Sympy Derivative & Taylor / 微分 & テイラー ---
x_s, a_s = sp.symbols('x_s a_s')  # symbols definition / シンボル定義
expr = a_s * x_s  # Example: P = a * x / 例式
deriv = sp.diff(expr, x_s)  # derivative / 微分
taylor3 = sp.series(sp.sin(x_s), x_s, 0, 5)  # Taylor series for sin(x) up to x^4
print(f"Symbolic derivative of a*x: {deriv}")  # 出力: 微分結果
print(f"Taylor series of sin(x): {taylor3}")  # 出力: テイラー展開

# --- 9. Trigonometric Regression / 三角関数回帰 (周期的歴史イベント) ---
# Synthetic periodic event data (e.g., solar cycles) / 太陽活動周期データの合成
np.random.seed(0)
x_trig = np.linspace(0, 11, 100)  # years within one solar cycle (~11 years)
noise_trig = np.random.randn(100) * 0.2
# Simulate cycle amplitude
y_trig = 50 + 40 * np.sin(2 * np.pi * x_trig / 11) + noise_trig
# Feature transform with an intercept / 切片付き特徴量変換
# (the data oscillates around ~50, so a constant column is needed for the fit level)
design_trig = np.vstack([np.ones_like(x_trig),
                         np.sin(2 * np.pi * x_trig / 11),
                         np.cos(2 * np.pi * x_trig / 11)]).T
theta_trig = np.linalg.pinv(design_trig.T @ design_trig) @ design_trig.T @ y_trig
C_trig, A_trig, B_trig = theta_trig
print(f"Trig regression coeffs C: {C_trig:.3f}, A: {A_trig:.3f}, B: {B_trig:.3f}")
# Plot trig regression / プロット
plt.scatter(x_trig, y_trig, label='Data')
plt.plot(x_trig,
         C_trig + A_trig * np.sin(2 * np.pi * x_trig / 11)
         + B_trig * np.cos(2 * np.pi * x_trig / 11),
         label='Trig Fit')
plt.title("Trigonometric Regression on Solar Cycle")
plt.xlabel("Year in Cycle")
plt.ylabel("Activity Level")
plt.legend()
plt.show()
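# Derived quantities (sketch): amplitude and phase recovered from the sin/cos
# coefficients; the amplitude should land near the 40 used in the simulation.
amp_trig = np.sqrt(A_trig**2 + B_trig**2)
phase_trig = np.arctan2(B_trig, A_trig)
print(f"Estimated amplitude: {amp_trig:.3f}, phase: {phase_trig:.3f} rad")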

# --- 10. Exponential Regression / 指数関数回帰 (世界GDP成長) ---
# Synthetic world GDP growth data / 世界GDP成長データの合成
np.random.seed(1)
x_exp = np.array([1960, 1970, 1980, 1990, 2000, 2010, 2020])
gdp_world = np.array([1.0, 2.0, 3.5, 6.0, 10.0, 18.0, 25.0]) + np.random.randn(7) * 1.0
# Fit exponential y = a * exp(b*x)
Y_log = np.log(gdp_world)
X_exp = np.vstack([np.ones_like(x_exp), x_exp]).T
beta_exp = np.linalg.pinv(X_exp.T @ X_exp) @ X_exp.T @ Y_log
a_exp = np.exp(beta_exp[0])
b_exp = beta_exp[1]
print(f"Exp regression a: {a_exp:.3f}, b: {b_exp:.6f}")
# Plot exponential fit
x_fit = np.linspace(1960, 2020, 100)
plt.scatter(x_exp, gdp_world, label='Data')
plt.plot(x_fit, a_exp * np.exp(b_exp * x_fit), label='Exp Fit')
plt.title("Exponential Regression on World GDP")
plt.xlabel("Year")
plt.ylabel("GDP (trillion USD)")
plt.legend()
plt.show()
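# Interpretation (sketch): the fitted b implies an average annual growth factor
# exp(b) and a doubling time of ln(2)/b years.
print(f"Implied annual growth: {(np.exp(b_exp) - 1) * 100:.2f}%, "
      f"doubling time: {np.log(2) / b_exp:.1f} years")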

# --- 11. Logarithmic Regression / 対数関数回帰 (識字率 vs GDP per capita) ---
# Synthetic literacy vs GDP per capita data / 識字率と一人当たりGDPのデータ合成
np.random.seed(2)
x_log = np.array([500, 1000, 2000, 5000, 10000, 20000])  # GDP per capita
literacy = 40 + 50 * np.log(x_log) / np.log(20000) + np.random.randn(6) * 2
# Fit logarithmic y = a + b * ln(x)
Phi_log = np.vstack([np.ones_like(x_log), np.log(x_log)]).T
beta_log = np.linalg.pinv(Phi_log.T @ Phi_log) @ Phi_log.T @ literacy
a_log, b_log = beta_log
print(f"Log regression a: {a_log:.3f}, b: {b_log:.3f}")
# Plot logarithmic fit
x_plot = np.linspace(500, 20000, 100)
plt.scatter(x_log, literacy, label='Data')
plt.plot(x_plot, a_log + b_log * np.log(x_plot), label='Log Fit')
plt.title("Logarithmic Regression: Literacy vs GDP per Capita")
plt.xlabel("GDP per Capita (USD)")
plt.ylabel("Literacy Rate (%)")
plt.legend()
plt.show()
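# Usage example (sketch): predicted literacy rate at a hypothetical GDP per
# capita of 15000 USD (illustrative value, not in the data).
print(f"Predicted literacy at $15000: {a_log + b_log * np.log(15000):.1f}%")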

# --- 12. t-Test for Coefficients / t検定 ---
# Refit the simple linear model from section 2 with scikit-learn / 第2節のOLSモデルをscikit-learnで再推定
t_model = LinearRegression().fit(years_jp.reshape(-1,1), pop_jp)
beta = t_model.coef_[0]
se = np.sqrt(ss_res / (len(years_jp) - 2) / np.sum((years_jp - mean_year)**2))  # 標準誤差

t_stat = beta / se
p_val = stats.t.sf(abs(t_stat), df=len(years_jp)-2) * 2
print(f"t-stat: {t_stat:.3f}, p-value: {p_val:.3f}")

# --- 13. Cross-Validation / 交差検証 ---
cvs = cross_val_score(LinearRegression(), years_jp.reshape(-1,1), pop_jp, cv=3)
print(f"Cross-validation R² scores: {cvs}")

# --- 14. Numeric Integration of Residual CDF / 残差CDF数値積分 ---
residuals = pop_jp - t_model.predict(years_jp.reshape(-1,1))
x_vals = np.sort(residuals)
cdf_vals = np.arange(1, len(residuals)+1) / len(residuals)
risk = np.trapz(cdf_vals, x_vals)
print(f"Integrated residual CDF (risk metric): {risk:.3f}")
