Statistics Notes (統計学メモ)

Posted at 2024-07-22

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Prior: assume a normal distribution with mean 50 and standard deviation 10
mu_prior = 50
sigma_prior = 10
prior = norm(mu_prior, sigma_prior)

# Observation: a value assumed to follow a normal distribution with mean 55 and standard deviation 5
mu_likelihood = 55
sigma_likelihood = 5
likelihood = norm(mu_likelihood, sigma_likelihood)

# Compute the posterior: Bayesian update of a normal prior with a normal likelihood
# Posterior mean and variance
sigma_posterior = 1 / np.sqrt((1/sigma_prior**2) + (1/sigma_likelihood**2))
mu_posterior = sigma_posterior**2 * ((mu_prior/sigma_prior**2) + (mu_likelihood/sigma_likelihood**2))
posterior = norm(mu_posterior, sigma_posterior)

# Visualization
x = np.linspace(30, 80, 1000)

plt.figure(figsize=(10, 6))
plt.plot(x, prior.pdf(x), label=f'Prior (mean={mu_prior}, std={sigma_prior})', color='blue')
plt.plot(x, likelihood.pdf(x), label=f'Likelihood (mean={mu_likelihood}, std={sigma_likelihood})', color='green')
plt.plot(x, posterior.pdf(x), label=f'Posterior (mean={mu_posterior:.2f}, std={sigma_posterior:.2f})', color='red')

plt.title('Bayesian Inference with Normal Distributions')
plt.xlabel('Parameter value')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()
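
For reference, the update implemented above is the standard conjugate result for a normal prior combined with a single normal observation of known standard deviation:

$$
\sigma_{\mathrm{post}}^2 = \left(\frac{1}{\sigma_{\mathrm{prior}}^2} + \frac{1}{\sigma_{\mathrm{like}}^2}\right)^{-1},
\qquad
\mu_{\mathrm{post}} = \sigma_{\mathrm{post}}^2\left(\frac{\mu_{\mathrm{prior}}}{\sigma_{\mathrm{prior}}^2} + \frac{\mu_{\mathrm{like}}}{\sigma_{\mathrm{like}}^2}\right)
$$

Plugging in the values above gives a posterior mean of 54 and a posterior standard deviation of about 4.47, which is what the plot legend should show.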



import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

# カイ2乗分布の自由度
df = 4

# x軸の値
x = np.linspace(0, 20, 1000)

# カイ2乗分布
chi2_dist = chi2(df)

# カイ2乗分布のプロット
plt.figure(figsize=(10, 6))
plt.plot(x, chi2_dist.pdf(x), 'r-', label=f'Chi-squared Distribution (df={df})')

plt.title('Chi-squared Distribution')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()



from scipy import stats
import numpy as np

# サンプルデータ
np.random.seed(0)
data1 = np.random.normal(50, 10, 100)
data2 = np.random.normal(55, 10, 100)

# 2サンプルt検定
t_stat, p_value = stats.ttest_ind(data1, data2)

print(f"t-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")



from scipy import stats
import numpy as np

# サンプルデータ
np.random.seed(0)
group1 = np.random.normal(50, 10, 100)
group2 = np.random.normal(55, 10, 100)
group3 = np.random.normal(60, 10, 100)

# 分散分析(ANOVA)
f_stat, p_value = stats.f_oneway(group1, group2, group3)

print(f"F-statistic: {f_stat:.3f}")
print(f"P-value: {p_value:.3f}")



import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm

# 1. サンプルデータ生成
np.random.seed(0)
data1 = np.random.normal(50, 10, 100)
data2 = np.random.normal(55, 10, 100)
data = np.array([55, 65, 75, 85, 95])

# 2. 正規分布のプロット
mu = np.mean(data1)
sigma = np.std(data1)
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 1000)
norm_dist = norm.pdf(x, mu, sigma)

# 3. 偏差値の計算
mean = np.mean(data)
std_dev = np.std(data)
z_scores = (data - mean) / std_dev
deviation_values = 50 + 10 * z_scores

# 4. t検定の実行
t_stat, p_value = stats.ttest_ind(data1, data2)

# 5. プロットの作成
plt.figure(figsize=(15, 10))

# 正規分布
plt.subplot(2, 2, 1)
plt.plot(x, norm_dist, 'b-', label=f'Normal Distribution\n(mean={mu:.2f}, std={sigma:.2f})')
plt.title('Normal Distribution')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)

# 偏差値の棒グラフ
plt.subplot(2, 2, 2)
plt.bar(range(len(deviation_values)), deviation_values, color='orange')
plt.title('Deviation Values (偏差値)')
plt.xlabel('Data Points')
plt.ylabel('Deviation Value')
plt.grid(True)

# 標準偏差の表示
plt.subplot(2, 2, 3)
plt.hist(data, bins=5, color='purple', alpha=0.7, rwidth=0.85)
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2)
plt.axvline(mean + std_dev, color='g', linestyle='dashed', linewidth=2)
plt.axvline(mean - std_dev, color='g', linestyle='dashed', linewidth=2)
plt.title(f'Standard Deviation (σ={std_dev:.2f})')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)

# t検定の結果
plt.subplot(2, 2, 4)
labels = ['t-test P-value']
p_values = [p_value]
plt.bar(labels, p_values, color='blue')
plt.title('P-value from t-test')
plt.ylabel('P-value')
plt.ylim(0, 1)
plt.text(0, p_value + 0.02, f"{p_value:.3f}", ha='center', fontsize=12)

plt.suptitle('Statistical Analysis: Normal Distribution, Deviation, t-test, Standard Deviation', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()




import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Generate synthetic data
np.random.seed(0)
n = 100
x = np.random.normal(0, 1, n)  # Independent variable
group = np.random.choice(['A', 'B'], n)  # Categorical variable
covariate = np.random.normal(5, 2, n)  # Covariate
error = np.random.normal(0, 1, n)
y = 2 * x + 3 * (group == 'B') + 4 * covariate + error  # Dependent variable

# Create a DataFrame
df = pd.DataFrame({
    'x': x,
    'group': group,
    'covariate': covariate,
    'y': y
})

# Fit ANCOVA model
model = smf.ols('y ~ x + group + covariate', data=df).fit()

# Print model summary
print(model.summary())

# Plot the results
plt.figure(figsize=(10, 6))
for label, color in zip(['A', 'B'], ['blue', 'red']):
    subset = df[df['group'] == label].sort_values('x')  # sort by x so the fitted values are drawn left to right
    plt.scatter(subset['x'], subset['y'], label=f'Group {label}', color=color)
    plt.plot(subset['x'], model.fittedvalues[subset.index], color=color)

plt.xlabel('x')
plt.ylabel('y')
plt.title('ANCOVA Analysis')
plt.legend()
plt.show()



import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import matplotlib.pyplot as plt

# Generate synthetic data
np.random.seed(0)
n = 100
x1 = np.random.normal(0, 1, n)  # Numerical variable
x2 = np.random.choice(['A', 'B', 'C'], n)  # Categorical variable
y = 2 * x1 + 3 * (x2 == 'B') + 4 * (x2 == 'C') + np.random.normal(0, 1, n)  # Dependent variable

# Create DataFrame
df = pd.DataFrame({
    'x1': x1,
    'x2': x2,
    'y': y
})

# Convert categorical variable to dummy variables
encoder = OneHotEncoder(drop='first', sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in recent scikit-learn
dummy_vars = encoder.fit_transform(df[['x2']])
dummy_df = pd.DataFrame(dummy_vars, columns=encoder.get_feature_names_out(['x2']))

# Combine dummy variables with original data
df = df.join(dummy_df).drop('x2', axis=1)

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop('y', axis=1))

# Apply PCA
pca = PCA()
principal_components = pca.fit_transform(scaled_data)

# Create DataFrame for PCA results
pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(principal_components.shape[1])])
pca_df['y'] = df['y']

# Print PCA variance ratio
print("Explained variance ratio by each principal component:")
print(pca.explained_variance_ratio_)

# Plot the first two principal components
plt.figure(figsize=(10, 6))
scatter = plt.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['y'], cmap='viridis')
plt.colorbar(scatter, label='y')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA - First Two Principal Components')
plt.show()
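
For reference, the explained variance ratio printed above is each eigenvalue of the covariance matrix of the standardized features divided by their sum:

$$
\mathrm{ratio}_i = \frac{\lambda_i}{\sum_j \lambda_j}
$$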




import numpy as np
import pandas as pd
from sklearn.decomposition import FactorAnalysis
from scipy.stats import mannwhitneyu
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Seed for reproducibility
np.random.seed(0)

# Generate synthetic data for Factor Analysis
n = 200
data = pd.DataFrame({
    'feature1': np.random.normal(0, 1, n),
    'feature2': np.random.normal(0, 1, n),
    'feature3': np.random.normal(0, 1, n),
    'feature4': np.random.normal(0, 1, n),
    'feature5': np.random.normal(0, 1, n)
})

# Apply Factor Analysis
fa = FactorAnalysis(n_components=2)
factors = fa.fit_transform(data)
factor_df = pd.DataFrame(factors, columns=['Factor1', 'Factor2'])

print("Factor Analysis Components:")
print(factor_df.head())

# Nonparametric Test: Mann-Whitney U test
# Generate synthetic data for the test
group1 = np.random.normal(0, 1, 50)
group2 = np.random.normal(0.5, 1, 50)
stat, p_value = mannwhitneyu(group1, group2)
print("\nMann-Whitney U test:")
print(f"U statistic: {stat}")
print(f"P-value: {p_value}")

# Survival Time Analysis
# Generate synthetic survival data
n_survival = 100
time = np.random.exponential(10, n_survival)
event_observed = np.random.binomial(1, 0.7, n_survival)
group = np.random.choice(['A', 'B'], n_survival)

# Fit Kaplan-Meier estimator
kmf = KaplanMeierFitter()
plt.figure(figsize=(12, 6))

for grp in ['A', 'B']:
    kmf.fit(durations=time[group == grp], event_observed=event_observed[group == grp], label=f'Group {grp}')
    kmf.plot_survival_function()

plt.title('Kaplan-Meier Survival Curves')
plt.xlabel('Time')
plt.ylabel('Survival Probability')
plt.legend()
plt.show()




import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, wilcoxon, kruskal, spearmanr
import matplotlib.pyplot as plt

# Seed for reproducibility
np.random.seed(0)

# Generate synthetic data
n = 50
group1 = np.random.normal(0, 1, n)
group2 = np.random.normal(1, 1, n)
paired_data1 = np.random.normal(0, 1, n)
paired_data2 = paired_data1 + np.random.normal(0, 1, n)
groups = [np.random.normal(0, 1, n), np.random.normal(1, 1, n), np.random.normal(2, 1, n)]

# Mann-Whitney U Test
stat, p_value = mannwhitneyu(group1, group2)
print("Mann-Whitney U Test:")
print(f"U statistic: {stat}")
print(f"P-value: {p_value}")

# Wilcoxon Signed-Rank Test
stat, p_value = wilcoxon(paired_data1, paired_data2)
print("\nWilcoxon Signed-Rank Test:")
print(f"Statistic: {stat}")
print(f"P-value: {p_value}")

# Kruskal-Wallis H Test
stat, p_value = kruskal(*groups)
print("\nKruskal-Wallis H Test:")
print(f"H statistic: {stat}")
print(f"P-value: {p_value}")

# Spearman's Rank Correlation
correlation, p_value = spearmanr(group1, group2)
print("\nSpearman's Rank Correlation:")
print(f"Correlation coefficient: {correlation}")
print(f"P-value: {p_value}")

# Plotting the data for visual inspection
plt.figure(figsize=(12, 6))

# Plot for Mann-Whitney U Test
plt.subplot(2, 2, 1)
plt.boxplot([group1, group2], labels=['Group 1', 'Group 2'])
plt.title("Mann-Whitney U Test")

# Plot for Wilcoxon Signed-Rank Test
plt.subplot(2, 2, 2)
plt.scatter(paired_data1, paired_data2)
plt.xlabel('Paired Data 1')
plt.ylabel('Paired Data 2')
plt.title("Wilcoxon Signed-Rank Test")

# Plot for Kruskal-Wallis H Test
plt.subplot(2, 2, 3)
plt.boxplot(groups, labels=['Group 1', 'Group 2', 'Group 3'])
plt.title("Kruskal-Wallis H Test")

# Plot for Spearman's Rank Correlation
plt.subplot(2, 2, 4)
plt.scatter(group1, group2)
plt.xlabel('Group 1')
plt.ylabel('Group 2')
plt.title("Spearman's Rank Correlation")

plt.tight_layout()
plt.show()
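
For reference, Spearman's coefficient is the Pearson correlation of the ranks; when there are no ties it reduces to

$$
\rho = 1 - \frac{6\sum_i d_i^2}{n(n^2-1)}
$$

where d_i is the difference between the ranks of the i-th pair.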





import numpy as np
import pandas as pd
from sklearn.decomposition import FactorAnalysis
import matplotlib.pyplot as plt

# Seed for reproducibility
np.random.seed(0)

# Generate synthetic data
n_samples = 200
n_features = 5
data = np.random.normal(0, 1, (n_samples, n_features))

# Add some structure to the data
data[:, 1] += 0.5 * data[:, 0]
data[:, 2] += 0.5 * data[:, 1]
data[:, 3] += 0.5 * data[:, 2]
data[:, 4] += 0.5 * data[:, 3]

# Create DataFrame
df = pd.DataFrame(data, columns=[f'Feature{i+1}' for i in range(n_features)])

# Perform Factor Analysis
n_factors = 2
fa = FactorAnalysis(n_components=n_factors)
factors = fa.fit_transform(df)

# Create DataFrame for factors
factor_df = pd.DataFrame(factors, columns=[f'Factor{i+1}' for i in range(n_factors)])

# Print factor loadings
print("Factor Loadings:")
print(pd.DataFrame(fa.components_.T, columns=[f'Factor{i+1}' for i in range(n_factors)], index=df.columns))

# Plot the factors
plt.figure(figsize=(12, 6))
plt.scatter(factor_df['Factor1'], factor_df['Factor2'])
plt.xlabel('Factor 1')
plt.ylabel('Factor 2')
plt.title('Factor Analysis - Factor 1 vs Factor 2')
plt.grid(True)
plt.show()



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# サンプルデータ
data = [1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9]

# 度数分布を計算
frequency_distribution = pd.Series(data).value_counts().sort_index()

# 相対度数分布を計算
relative_frequency_distribution = frequency_distribution / len(data)

# 累積度数分布を計算
cumulative_frequency_distribution = frequency_distribution.cumsum()

# 累積相対度数分布を計算
cumulative_relative_frequency_distribution = relative_frequency_distribution.cumsum()

# 度数分布を表示
print("度数分布:")
print(frequency_distribution)

# 相対度数分布を表示
print("\n相対度数分布:")
print(relative_frequency_distribution)

# 累積度数分布を表示
print("\n累積度数分布:")
print(cumulative_frequency_distribution)

# 累積相対度数分布を表示
print("\n累積相対度数分布:")
print(cumulative_relative_frequency_distribution)

# 度数分布をプロット
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
frequency_distribution.plot(kind='bar')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Frequency Distribution')

plt.subplot(2, 2, 2)
relative_frequency_distribution.plot(kind='bar')
plt.xlabel('Value')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequency Distribution')

plt.subplot(2, 2, 3)
cumulative_frequency_distribution.plot(kind='bar')
plt.xlabel('Value')
plt.ylabel('Cumulative Frequency')
plt.title('Cumulative Frequency Distribution')

plt.subplot(2, 2, 4)
cumulative_relative_frequency_distribution.plot(kind='bar')
plt.xlabel('Value')
plt.ylabel('Cumulative Relative Frequency')
plt.title('Cumulative Relative Frequency Distribution')

plt.tight_layout()
plt.show()


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# サンプルデータ
data = np.random.normal(loc=0, scale=1, size=1000)  # 平均0、標準偏差1の正規分布に従うデータ

# カーネル密度推定のプロット
sns.kdeplot(data, fill=True)  # 'shade' was replaced by 'fill' in recent seaborn versions
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Kernel Density Estimation')
plt.show()

import numpy as np

# サンプルデータ
data = [1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9]

# 平均を計算
mean = np.mean(data)

# 標本分散を計算
sample_variance = np.var(data, ddof=0)

# 不偏分散を計算
unbiased_variance = np.var(data, ddof=1)

# 標準偏差を計算
std_dev = np.std(data, ddof=1)

# 変動係数を計算
coefficient_of_variation = std_dev / mean

# 結果を表示
print(f"平均: {mean}")
print(f"標本分散: {sample_variance}")
print(f"不偏分散: {unbiased_variance}")
print(f"標準偏差: {std_dev}")
print(f"変動係数: {coefficient_of_variation}")



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# サンプルデータ
data = [1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9]

# データの度数分布を計算
frequency_distribution = pd.Series(data).value_counts().sort_index()

# 度数分布を表示
print("度数分布:")
print(frequency_distribution)

# 度数分布をプロット
frequency_distribution.plot(kind='bar')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Frequency Distribution')
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# サンプルデータ
data = [1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9]

# 最小値、最大値、中央値、四分位点を計算
min_value = np.min(data)
max_value = np.max(data)
median = np.median(data)
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)

# 結果を表示
print(f"最小値: {min_value}")
print(f"最大値: {max_value}")
print(f"中央値: {median}")
print(f"第一四分位点 (Q1): {q1}")
print(f"第三四分位点 (Q3): {q3}")

# データの分布をプロット
plt.figure(figsize=(12, 6))

# ヒストグラムとKDEプロット
sns.histplot(data, kde=True, bins=10, color='skyblue', stat='density')
plt.axvline(min_value, color='r', linestyle='--', label='Min')
plt.axvline(max_value, color='g', linestyle='--', label='Max')
plt.axvline(median, color='b', linestyle='-', label='Median')
plt.axvline(q1, color='purple', linestyle='--', label='Q1')
plt.axvline(q3, color='orange', linestyle='--', label='Q3')

# プロットのラベルとタイトル
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Distribution with Min, Max, Median, Q1, Q3')
plt.legend()

# プロットを表示
plt.show()


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# サンプルデータの生成
np.random.seed(0)
data = pd.DataFrame({
    'X': np.random.normal(0, 1, 100),
    'Y': np.random.normal(5, 2, 100),
    'Z': np.random.normal(-2, 1, 100)
})

# 分散共分散行列を計算
cov_matrix = data.cov()

# ピアソンの積率相関係数を計算
pearson_corr = data.corr(method='pearson')

# 相関行列を計算
corr_matrix = data.corr()

# 結果を表示
print("分散共分散行列:")
print(cov_matrix)
print("\nピアソンの積率相関係数:")
print(pearson_corr)
print("\n相関行列:")
print(corr_matrix)

# 相関行列のヒートマップをプロット
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')
plt.show()


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# サンプルデータ(離散)
discrete_data = [1, 2, 2, 3, 3, 3, 4, 4, 5]

# PMFの計算
values, counts = np.unique(discrete_data, return_counts=True)
pmf = counts / len(discrete_data)

# 一様分布のサンプルデータ生成
a, b = 0, 10  # 下限と上限
uniform_data = np.random.uniform(a, b, 1000)

# 正規分布のサンプルデータ生成
mu, sigma = 0, 1  # 平均と標準偏差
normal_data = np.random.normal(mu, sigma, 1000)

# プロット設定
plt.figure(figsize=(16, 12))

# PMFプロット
plt.subplot(3, 1, 1)
plt.stem(values, pmf, basefmt=" ")  # use_line_collection was removed in Matplotlib 3.8 and is no longer needed
plt.xlabel('Value')
plt.ylabel('Probability')
plt.title('Probability Mass Function (PMF)')

# 一様分布プロット
plt.subplot(3, 1, 2)
plt.hist(uniform_data, bins=30, density=True, alpha=0.6, color='g', edgecolor='black')
x_uniform = np.linspace(a, b, 100)
pdf_uniform = [1 / (b - a)] * len(x_uniform)
plt.plot(x_uniform, pdf_uniform, 'k', linewidth=2)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Uniform Distribution')

# PDFプロット
plt.subplot(3, 1, 3)
sns.histplot(normal_data, bins=30, kde=True, stat="density", color='b', edgecolor='black')
x_normal = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
pdf_normal = norm.pdf(x_normal, mu, sigma)
plt.plot(x_normal, pdf_normal, 'k', linewidth=2)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Probability Density Function (PDF)')

# プロット表示
plt.tight_layout()
plt.show()



import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# サンプルデータの生成
np.random.seed(0)
data = np.random.normal(0, 1, 1000)  # 平均0、標準偏差1の正規分布に従うデータ

# 理論的なCDFを計算
x = np.linspace(min(data), max(data), 1000)
cdf = norm.cdf(x, np.mean(data), np.std(data))

# データのCDFを計算
sorted_data = np.sort(data)
yvals = np.arange(1, len(sorted_data) + 1) / len(sorted_data)

# プロット設定
plt.figure(figsize=(10, 6))

# データのCDFプロット
plt.step(sorted_data, yvals, label='Empirical CDF')

# 理論的なCDFプロット
plt.plot(x, cdf, 'k--', label='Theoretical CDF')

# プロットのラベルとタイトル
plt.xlabel('Value')
plt.ylabel('Cumulative Probability')
plt.title('Cumulative Distribution Function (CDF)')
plt.legend()

# プロットを表示
plt.show()
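
For reference, the empirical CDF drawn above is

$$
\hat{F}_n(x) = \frac{1}{n}\#\{i : x_i \le x\},
$$

i.e. the sorted data plotted against the cumulative ranks 1/n, 2/n, ..., 1.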

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import multivariate_normal

# サンプルデータの生成
np.random.seed(0)
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # 共分散行列
data = np.random.multivariate_normal(mean, cov, 1000)

# データをDataFrameに変換
df = pd.DataFrame(data, columns=['X', 'Y'])

# 同時確率分布の計算(2次元ヒストグラム)
joint_hist, xedges, yedges = np.histogram2d(df['X'], df['Y'], bins=30, density=True)

# 周辺分布の計算
x_marginal = np.sum(joint_hist, axis=1) * np.diff(yedges)[0]
y_marginal = np.sum(joint_hist, axis=0) * np.diff(xedges)[0]

# 同時確率分布のプロット
plt.figure(figsize=(18, 6))

# 1. 同時確率分布
plt.subplot(1, 3, 1)
plt.imshow(joint_hist.T, origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', cmap='Blues')
plt.colorbar()
plt.title('Joint Probability Distribution')
plt.xlabel('X')
plt.ylabel('Y')

# 2. Xの周辺分布
plt.subplot(1, 3, 2)
plt.plot(xedges[:-1], x_marginal, label='Marginal Distribution of X', color='blue')
plt.fill_between(xedges[:-1], x_marginal, alpha=0.2, color='blue')
plt.xlabel('X')
plt.ylabel('Probability')
plt.title('Marginal Distribution of X')

# 3. Yの周辺分布
plt.subplot(1, 3, 3)
plt.plot(yedges[:-1], y_marginal, label='Marginal Distribution of Y', color='red')
plt.fill_between(yedges[:-1], y_marginal, alpha=0.2, color='red')
plt.xlabel('Y')
plt.ylabel('Probability')
plt.title('Marginal Distribution of Y')

plt.tight_layout()
plt.show()


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# サンプルデータの生成
np.random.seed(0)
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # 共分散行列
data = np.random.multivariate_normal(mean, cov, 1000)

# データをDataFrameに変換
df = pd.DataFrame(data, columns=['X', 'Y'])

# Xがある範囲にあるときのYの条件付き確率分布
x_range = (-1, 1)  # Xの範囲
condition = (df['X'] >= x_range[0]) & (df['X'] <= x_range[1])
conditional_data = df[condition]['Y']

# Yのヒストグラムとカーネル密度推定(KDE)をプロット
plt.figure(figsize=(12, 6))

# 条件付き確率分布のプロット
sns.histplot(conditional_data, kde=True, stat='density', color='blue', edgecolor='black')
plt.xlabel('Y')
plt.ylabel('Density')
plt.title(f'Conditional Distribution of Y given X in range {x_range}')
plt.show()


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import multivariate_normal

# サンプルデータの生成
np.random.seed(0)
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # 共分散行列
data = np.random.multivariate_normal(mean, cov, 1000)

# データをDataFrameに変換
df = pd.DataFrame(data, columns=['X', 'Y'])

# 同時確率分布の計算(2次元ヒストグラム)
plt.figure(figsize=(18, 6))

# 1. 2次元ヒストグラム
plt.subplot(1, 2, 1)
plt.hist2d(df['X'], df['Y'], bins=30, density=True, cmap='Blues', edgecolors='k')
plt.colorbar(label='Density')
plt.title('2D Histogram of Joint Distribution')
plt.xlabel('X')
plt.ylabel('Y')

# 2. カーネル密度推定(KDE)
plt.subplot(1, 2, 2)
sns.kdeplot(data=df, x='X', y='Y', cmap='Blues', fill=True)
plt.title('Kernel Density Estimation (KDE)')
plt.xlabel('X')
plt.ylabel('Y')

plt.tight_layout()
plt.show()



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# サンプルデータの生成
np.random.seed(0)
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # 共分散行列
data = np.random.multivariate_normal(mean, cov, 1000)

# データをDataFrameに変換
df = pd.DataFrame(data, columns=['X', 'Y'])

# 共分散行列の計算
cov_matrix = np.cov(df['X'], df['Y'])
print("Covariance Matrix:\n", cov_matrix)

# 相関係数の計算
correlation_matrix = np.corrcoef(df['X'], df['Y'])
print("Correlation Matrix:\n", correlation_matrix)

# プロット設定
plt.figure(figsize=(12, 6))

# 散布図と回帰線
plt.subplot(1, 2, 1)
sns.scatterplot(data=df, x='X', y='Y')
sns.regplot(data=df, x='X', y='Y', scatter=False, color='red')
plt.title('Scatter Plot with Regression Line')
plt.xlabel('X')
plt.ylabel('Y')

# 相関係数のヒートマップ
plt.subplot(1, 2, 2)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, square=True)
plt.title('Correlation Matrix Heatmap')

plt.tight_layout()
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 二値確率変数のサンプルデータ生成
np.random.seed(0)
n_samples = 1000
p = 0.3  # 成功の確率
data = np.random.binomial(1, p, n_samples)  # 成功(1)または失敗(0)

# データをDataFrameに変換
df = pd.DataFrame(data, columns=['BinaryVariable'])

# 確率分布の計算
prob_success = np.mean(df['BinaryVariable'])
prob_failure = 1 - prob_success

print(f"Probability of Success (1): {prob_success:.2f}")
print(f"Probability of Failure (0): {prob_failure:.2f}")

# 期待値と分散の計算
expected_value = np.mean(df['BinaryVariable'])
variance = np.var(df['BinaryVariable'])

print(f"Expected Value (Mean): {expected_value:.2f}")
print(f"Variance: {variance:.2f}")

# 二値確率変数のプロット
plt.figure(figsize=(12, 6))

# 確率分布のバーグラフ
plt.subplot(1, 2, 1)
sns.barplot(x=['0', '1'], y=[prob_failure, prob_success], palette='Blues')
plt.xlabel('Binary Variable')
plt.ylabel('Probability')
plt.title('Probability Distribution of Binary Variable')

# ヒストグラム
plt.subplot(1, 2, 2)
sns.histplot(df['BinaryVariable'], bins=2, discrete=True, color='blue', edgecolor='black')
plt.xlabel('Binary Variable')
plt.ylabel('Frequency')
plt.title('Histogram of Binary Variable')

plt.tight_layout()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 成功確率の設定
p = 0.3  # 成功の確率
n_samples = 1000  # サンプル数

# ベルヌーイ試行のサンプルデータ生成
np.random.seed(0)
data = np.random.binomial(1, p, n_samples)  # 成功(1)または失敗(0)

# 成功確率の計算
prob_success = np.mean(data)
prob_failure = 1 - prob_success

print(f"設定された成功確率 (p): {p:.2f}")
print(f"計算された成功確率 (実測値): {prob_success:.2f}")
print(f"計算された失敗確率 (1 - p): {prob_failure:.2f}")

# 成功確率のプロット
plt.figure(figsize=(12, 6))

# 確率分布のバーグラフ
plt.subplot(1, 2, 1)
sns.barplot(x=['Failure (0)', 'Success (1)'], y=[1 - prob_success, prob_success], palette='Blues')
plt.xlabel('Outcome')
plt.ylabel('Probability')
plt.title('Probability Distribution of Bernoulli Trial')

# ヒストグラム
plt.subplot(1, 2, 2)
sns.histplot(data, bins=2, discrete=True, color='blue', edgecolor='black')
plt.xlabel('Bernoulli Trial Result')
plt.ylabel('Frequency')
plt.title('Histogram of Bernoulli Trial Results')

plt.tight_layout()
plt.show()
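
For reference, a Bernoulli variable with success probability p has

$$
E[X] = p, \qquad \mathrm{Var}[X] = p(1-p),
$$

so with p = 0.3 the empirical success rate printed above should be close to 0.3.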


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 二項分布のパラメータ設定
n = 10  # 試行回数
p = 0.3  # 成功の確率
n_samples = 1000  # サンプル数

# 二項分布のサンプルデータ生成
np.random.seed(0)
data = np.random.binomial(n, p, n_samples)

# 期待値と分散の計算
expected_value = n * p
variance = n * p * (1 - p)

print(f"期待値(平均): {expected_value:.2f}")
print(f"分散: {variance:.2f}")

# ヒストグラムのプロット
plt.figure(figsize=(10, 6))

# ヒストグラム
sns.histplot(data, bins=range(n + 2), discrete=True, color='blue', edgecolor='black')
plt.xlabel('Number of Successes')
plt.ylabel('Frequency')
plt.title('Histogram of Binomial Distribution')

plt.axvline(expected_value, color='red', linestyle='--', label=f'Expected Value (Mean) = {expected_value:.2f}')
plt.legend()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# 正規分布のパラメータ設定
mu = 0      # 平均
sigma = 1   # 標準偏差

# データ範囲の設定
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 1000)

# 確率密度関数の計算
pdf = stats.norm.pdf(x, mu, sigma)

# PDFのプロット
plt.figure(figsize=(10, 6))
plt.plot(x, pdf, label=f'Normal Distribution\n$\\mu={mu}$, $\\sigma={sigma}$', color='blue')  # escape the backslashes so \m and \s are not treated as string escapes
plt.fill_between(x, pdf, alpha=0.2, color='blue')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.title('Probability Density Function of Normal Distribution')
plt.legend()
plt.grid(True)
plt.show()
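
For reference, the density plotted above is

$$
f(x) = \frac{1}{\sigma\sqrt{2\pi}}\exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right).
$$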



import numpy as np

# 母集団データの生成
np.random.seed(0)
population_data = np.random.normal(loc=50, scale=10, size=1000)  # 平均50, 標準偏差10の正規分布

# 母平均、母分散、母標準偏差の計算
population_mean = np.mean(population_data)
population_variance = np.var(population_data)
population_std_dev = np.sqrt(population_variance)

print(f"母平均 (Population Mean): {population_mean:.2f}")
print(f"母分散 (Population Variance): {population_variance:.2f}")
print(f"母標準偏差 (Population Standard Deviation): {population_std_dev:.2f}")

import numpy as np

# 母集団データの生成
np.random.seed(0)
population_data = np.random.normal(loc=50, scale=10, size=1000)  # 平均50, 標準偏差10の正規分布

# 標本の抽出
sample_size = 30
sample_data = np.random.choice(population_data, size=sample_size, replace=False)

# 標本平均の計算
sample_mean = np.mean(sample_data)

print(f"母集団の真の平均: {np.mean(population_data):.2f}")
print(f"標本平均(推定値): {sample_mean:.2f}")



import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 母集団のパラメータ設定
np.random.seed(0)
population_mean = 50
population_std_dev = 10
population_size = 10000
population_data = np.random.normal(loc=population_mean, scale=population_std_dev, size=population_size)

# サンプルサイズの設定
sample_sizes = [10, 30, 100]
n_simulations = 1000  # シミュレーションの回数

plt.figure(figsize=(18, 6))

for i, sample_size in enumerate(sample_sizes):
    sample_means = []

    for _ in range(n_simulations):
        sample_data = np.random.choice(population_data, size=sample_size, replace=False)
        sample_mean = np.mean(sample_data)
        sample_means.append(sample_mean)
    
    plt.subplot(1, 3, i+1)
    sns.histplot(sample_means, bins=30, kde=True, color='blue', stat='density')
    plt.axvline(population_mean, color='red', linestyle='--', label=f'Population Mean: {population_mean}')
    plt.title(f'Sample Size = {sample_size}')
    plt.xlabel('Sample Mean')
    plt.ylabel('Density')
    plt.legend()

plt.tight_layout()
plt.show()

import numpy as np

# 母集団データの生成
np.random.seed(0)
population_data = np.random.normal(loc=50, scale=10, size=1000)  # 平均50, 標準偏差10の正規分布

# 標本の抽出
sample_size = 30
sample_data = np.random.choice(population_data, size=sample_size, replace=False)

# 標本分散と不偏分散の計算
sample_mean = np.mean(sample_data)
sample_variance = np.mean((sample_data - sample_mean) ** 2)
sample_variance_unbiased = np.var(sample_data, ddof=1)

print(f"標本分散 (Sample Variance): {sample_variance:.2f}")
print(f"不偏分散 (Unbiased Sample Variance): {sample_variance_unbiased:.2f}")


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 母集団データの生成
np.random.seed(0)
population_mean = 50
population_std_dev = 10
population_size = 1000
population_data = np.random.normal(loc=population_mean, scale=population_std_dev, size=population_size)

# 標本の抽出
sample_size = 30
sample_data = np.random.choice(population_data, size=sample_size, replace=False)

# 標本平均と標準偏差の計算
sample_mean = np.mean(sample_data)
sample_std_dev = np.std(sample_data, ddof=0)  # 標本標準偏差(標本全体の分散を用いる)

# プロットの作成
plt.figure(figsize=(12, 6))

# バイオリンプロット
sns.violinplot(data=[population_data, sample_data], inner="quartile", palette="muted")

# 平均の表示
plt.axhline(population_mean, color='red', linestyle='--', label=f'Population Mean: {population_mean:.2f}')
plt.axhline(sample_mean, color='blue', linestyle='--', label=f'Sample Mean: {sample_mean:.2f}')

plt.xlabel('Group')
plt.ylabel('Value')
plt.title('Population and Sample Data Distribution')
plt.legend()
plt.grid(True)

plt.show()

# 結果の表示
print(f"母平均 (Population Mean): {population_mean:.2f}")
print(f"標本平均 (Sample Mean): {sample_mean:.2f}")
print(f"標本標準偏差 (Sample Standard Deviation): {sample_std_dev:.2f}")


import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# 設定
x = np.linspace(0, 5, 1000)

# Chi-Square Distribution
df_chi2 = 3  # 自由度
chi2_dist = stats.chi2.pdf(x, df_chi2)

# t-Distribution
df_t = 10  # 自由度
t_dist = stats.t.pdf(x, df_t)

# F-Distribution
df1_f = 5   # 分子自由度
df2_f = 10  # 分母自由度
f_dist = stats.f.pdf(x, df1_f, df2_f)

# プロット
plt.figure(figsize=(18, 6))

# Chi-Square Distribution
plt.subplot(1, 3, 1)
plt.plot(x, chi2_dist, label=f'Chi-Square (df={df_chi2})')
plt.title('Chi-Square Distribution')
plt.xlabel('x')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

# t-Distribution
plt.subplot(1, 3, 2)
plt.plot(x, t_dist, label=f't-Distribution (df={df_t})', color='orange')
plt.title('t-Distribution')
plt.xlabel('x')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

# F-Distribution
plt.subplot(1, 3, 3)
plt.plot(x, f_dist, label=f'F-Distribution (df1={df1_f}, df2={df2_f})', color='green')
plt.title('F-Distribution')
plt.xlabel('x')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()



import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# 母集団データの生成
np.random.seed(0)
population_mean = 50
population_std_dev = 10
population_size = 1000
population_data = np.random.normal(loc=population_mean, scale=population_std_dev, size=population_size)

# 標本の抽出
sample_size = 30
sample_data = np.random.choice(population_data, size=sample_size, replace=False)

# 標本平均と標準誤差の計算
sample_mean = np.mean(sample_data)
sample_std_dev = np.std(sample_data, ddof=1)  # 標本標準偏差
standard_error = sample_std_dev / np.sqrt(sample_size)

# 信頼係数と臨界値の設定
confidence_level = 0.95
alpha = 1 - confidence_level
z_critical = stats.norm.ppf(1 - alpha/2)  # 正規分布の臨界値

# 信頼区間の計算
margin_of_error = z_critical * standard_error
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

# プロット
plt.figure(figsize=(10, 6))
plt.hist(sample_data, bins=20, alpha=0.6, color='g', edgecolor='black', density=True)
plt.axvline(sample_mean, color='blue', linestyle='--', label=f'Sample Mean: {sample_mean:.2f}')
plt.axvline(confidence_interval[0], color='red', linestyle='--', label=f'Lower CI: {confidence_interval[0]:.2f}')
plt.axvline(confidence_interval[1], color='red', linestyle='--', label=f'Upper CI: {confidence_interval[1]:.2f}')
plt.title('Sample Data Distribution with Confidence Interval')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# 結果の表示
print(f"標本平均 (Sample Mean): {sample_mean:.2f}")
print(f"標本標準偏差 (Sample Standard Deviation): {sample_std_dev:.2f}")
print(f"標準誤差 (Standard Error): {standard_error:.2f}")
print(f"信頼区間 (Confidence Interval): {confidence_interval}")

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# 母集団データの生成
np.random.seed(0)
population_mean = 50
population_std_dev = 10
population_size = 1000
population_data = np.random.normal(loc=population_mean, scale=population_std_dev, size=population_size)

# 標本の抽出
sample_size = 30
sample_data = np.random.choice(population_data, size=sample_size, replace=False)

# 母平均の区間推定
sample_mean = np.mean(sample_data)
sample_std_dev = np.std(sample_data, ddof=1)  # 標本標準偏差
standard_error = sample_std_dev / np.sqrt(sample_size)

# 信頼係数と臨界値の設定
confidence_level = 0.95
alpha = 1 - confidence_level

# 正規分布に基づく信頼区間(大きな標本サイズの場合)
z_critical = stats.norm.ppf(1 - alpha/2)
margin_of_error = z_critical * standard_error
mean_confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

# t分布に基づく信頼区間(標本サイズが小さい場合)
t_critical = stats.t.ppf(1 - alpha/2, df=sample_size-1)
margin_of_error_t = t_critical * standard_error
mean_confidence_interval_t = (sample_mean - margin_of_error_t, sample_mean + margin_of_error_t)

# 母分散の区間推定
sample_variance = np.var(sample_data, ddof=1)  # 標本分散
chi2_lower = stats.chi2.ppf(alpha/2, df=sample_size-1)
chi2_upper = stats.chi2.ppf(1 - alpha/2, df=sample_size-1)

variance_confidence_interval = (
    (sample_size - 1) * sample_variance / chi2_upper,
    (sample_size - 1) * sample_variance / chi2_lower
)

# 結果の表示
print(f"母平均の信頼区間 (正規分布ベース): {mean_confidence_interval}")
print(f"母平均の信頼区間 (t分布ベース): {mean_confidence_interval_t}")
print(f"母分散の信頼区間: {variance_confidence_interval}")

# データの可視化
plt.figure(figsize=(12, 6))

# 標本データのヒストグラム
plt.subplot(1, 2, 1)
plt.hist(sample_data, bins=20, alpha=0.6, color='g', edgecolor='black', density=True)
plt.axvline(sample_mean, color='blue', linestyle='--', label=f'Sample Mean: {sample_mean:.2f}')
plt.axvline(mean_confidence_interval[0], color='red', linestyle='--', label=f'Lower CI: {mean_confidence_interval[0]:.2f}')
plt.axvline(mean_confidence_interval[1], color='red', linestyle='--', label=f'Upper CI: {mean_confidence_interval[1]:.2f}')
plt.title('Sample Data Distribution with Mean CI')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

# 母分散の信頼区間の表示
plt.subplot(1, 2, 2)
plt.bar(['Lower Bound', 'Upper Bound'], [variance_confidence_interval[0], variance_confidence_interval[1]], color=['red', 'blue'])
plt.title('Variance Confidence Interval')
plt.ylabel('Variance')
plt.grid(True)

plt.tight_layout()
plt.show()
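
For reference, the chi-squared interval for the population variance used above is

$$
\left(\frac{(n-1)s^2}{\chi^2_{1-\alpha/2,\,n-1}},\;\frac{(n-1)s^2}{\chi^2_{\alpha/2,\,n-1}}\right),
$$

and the t-based interval for the mean simply replaces the z critical value with t_{1-α/2, n-1}.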



import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# 母集団データの生成
np.random.seed(0)
population_size = 1000

# 2つの異なる母集団データを生成(不等分散を考慮)
population1_mean = 50
population1_std_dev = 10
population1_data = np.random.normal(loc=population1_mean, scale=population1_std_dev, size=population_size)

population2_mean = 55
population2_std_dev = 15
population2_data = np.random.normal(loc=population2_mean, scale=population2_std_dev, size=population_size)

# 標本の抽出
sample_size1 = 30
sample_size2 = 30
sample_data1 = np.random.choice(population1_data, size=sample_size1, replace=False)
sample_data2 = np.random.choice(population2_data, size=sample_size2, replace=False)

# ウェルチのt検定
t_statistic, p_value = stats.ttest_ind(sample_data1, sample_data2, equal_var=False)  # equal_var=Falseで不等分散を考慮

# 結果の表示
print(f"ウェルチのt検定のt統計量: {t_statistic:.2f}")
print(f"P値: {p_value:.4f}")

# データの可視化
plt.figure(figsize=(10, 6))

# 標本データ1のヒストグラム
plt.hist(sample_data1, bins=20, alpha=0.6, color='blue', edgecolor='black', density=True, label='Sample 1')
plt.hist(sample_data2, bins=20, alpha=0.6, color='orange', edgecolor='black', density=True, label='Sample 2')

plt.title('Histogram of Sample Data')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

plt.show()


import numpy as np
import scipy.stats as stats  # needed for stats.norm.pdf inside negative_log_likelihood
from scipy.optimize import minimize

# データ生成
np.random.seed(0)
true_mean = 50
true_std_dev = 10
sample_size = 100
sample_data = np.random.normal(loc=true_mean, scale=true_std_dev, size=sample_size)

# 尤度関数の定義
def negative_log_likelihood(params, data):
    mu, sigma = params
    if sigma <= 0:  # 標準偏差は正でなければならない
        return np.inf
    likelihood = -np.sum(np.log(stats.norm.pdf(data, loc=mu, scale=sigma)))
    return likelihood

# 最尤推定
initial_params = [np.mean(sample_data), np.std(sample_data)]
result = minimize(negative_log_likelihood, initial_params, args=(sample_data,), bounds=[(None, None), (1e-5, None)])
estimated_mean, estimated_std_dev = result.x

# 結果の表示
print(f"推定された平均 (Estimated Mean): {estimated_mean:.2f}")
print(f"推定された標準偏差 (Estimated Std Dev): {estimated_std_dev:.2f}")

# データの可視化
import matplotlib.pyplot as plt
import scipy.stats as stats

plt.figure(figsize=(10, 6))

# ヒストグラム
plt.hist(sample_data, bins=20, alpha=0.6, color='blue', edgecolor='black', density=True, label='Sample Data')

# 推定された正規分布のPDF
x = np.linspace(min(sample_data), max(sample_data), 100)
pdf = stats.norm.pdf(x, loc=estimated_mean, scale=estimated_std_dev)
plt.plot(x, pdf, color='red', label=f'Estimated Normal Distribution\nMean: {estimated_mean:.2f}, Std Dev: {estimated_std_dev:.2f}')

plt.title('Histogram of Sample Data with Estimated Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

plt.show()
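
As a quick sanity check on the numerical optimization, the normal model also has closed-form maximum-likelihood estimates. A minimal sketch, reusing the sample_data array defined above (mu_hat and sigma_hat are just illustrative names):

# Closed-form ML estimates for a normal model: the sample mean and the
# biased (ddof=0) standard deviation; these should match the optimizer's result.
mu_hat = np.mean(sample_data)
sigma_hat = np.std(sample_data, ddof=0)
print(f"Closed-form MLE: mean={mu_hat:.2f}, std={sigma_hat:.2f}")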

import numpy as np
import scipy.stats as stats
from scipy.optimize import minimize
import matplotlib.pyplot as plt

# データ生成(正規分布に従うデータ)
np.random.seed(0)
true_mean = 50
true_std_dev = 10
sample_size = 100
data = np.random.normal(loc=true_mean, scale=true_std_dev, size=sample_size)

# 尤度関数の定義(正規分布の場合)
def negative_log_likelihood(params, data):
    mu, sigma = params
    if sigma <= 0:
        return np.inf
    likelihood = -np.sum(np.log(stats.norm.pdf(data, loc=mu, scale=sigma)))
    return likelihood

# 最尤推定
initial_params = [np.mean(data), np.std(data)]
result = minimize(negative_log_likelihood, initial_params, args=(data,), bounds=[(None, None), (1e-5, None)])
estimated_mean, estimated_std_dev = result.x

# 結果の表示
print(f"推定された平均 (Estimated Mean): {estimated_mean:.2f}")
print(f"推定された標準偏差 (Estimated Std Dev): {estimated_std_dev:.2f}")

# データの可視化
plt.figure(figsize=(10, 6))

# ヒストグラム
plt.hist(data, bins=20, alpha=0.6, color='blue', edgecolor='black', density=True, label='Sample Data')

# 推定された正規分布のPDF
x = np.linspace(min(data), max(data), 100)
pdf = stats.norm.pdf(x, loc=estimated_mean, scale=estimated_std_dev)
plt.plot(x, pdf, color='red', label=f'Estimated Normal Distribution\nMean: {estimated_mean:.2f}, Std Dev: {estimated_std_dev:.2f}')

plt.title('Histogram of Sample Data with Estimated Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# 正規線形モデルのデータ生成
np.random.seed(0)
n = 100
x = np.random.uniform(-10, 10, size=n)
beta_0 = 5
beta_1 = 2
sigma = 3
epsilon = np.random.normal(0, sigma, size=n)
y = beta_0 + beta_1 * x + epsilon

# 尤度関数の定義(正規線形モデルの場合)
def negative_log_likelihood_linear(params, y, x):
    beta_0, beta_1, sigma = params
    if sigma <= 0:
        return np.inf
    residuals = y - (beta_0 + beta_1 * x)
    likelihood = -np.sum(np.log(stats.norm.pdf(residuals, loc=0, scale=sigma)))
    return likelihood

# 最尤推定(正規線形モデル)
initial_params_linear = [0, 0, np.std(y - (np.mean(y) + np.mean(x)))]  # 初期値
result_linear = minimize(negative_log_likelihood_linear, initial_params_linear, args=(y, x), bounds=[(None, None), (None, None), (1e-5, None)])
estimated_beta_0, estimated_beta_1, estimated_sigma = result_linear.x

# 結果の表示
print(f"推定された切片 (Estimated Beta_0): {estimated_beta_0:.2f}")
print(f"推定された傾き (Estimated Beta_1): {estimated_beta_1:.2f}")
print(f"推定された標準偏差 (Estimated Sigma): {estimated_sigma:.2f}")

# 回帰直線のプロット
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.6, color='blue', label='Data')
plt.plot(x, estimated_beta_0 + estimated_beta_1 * x, color='red', label=f'Estimated Regression Line\nBeta_0: {estimated_beta_0:.2f}, Beta_1: {estimated_beta_1:.2f}')
plt.title('Regression Line with Estimated Parameters')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True)
plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# データ生成
np.random.seed(0)
n = 100
x = np.random.uniform(-10, 10, size=n).reshape(-1, 1)  # 特徴量
beta_0 = 5
beta_1 = 2
sigma = 3
epsilon = np.random.normal(0, sigma, size=n)  # 誤差項
y = beta_0 + beta_1 * x.flatten() + epsilon  # 目標変数

# 線形回帰モデルの訓練
model = LinearRegression()
model.fit(x, y)

# 予測値の計算
y_pred = model.predict(x)

# 損失関数の計算(平均二乗誤差)
mse = mean_squared_error(y, y_pred)

# 結果の表示
print(f"切片 (Intercept): {model.intercept_:.2f}")
print(f"傾き (Slope): {model.coef_[0]:.2f}")
print(f"平均二乗誤差 (MSE): {mse:.2f}")

# データと回帰直線のプロット
plt.figure(figsize=(10, 6))
plt.scatter(x, y, alpha=0.6, color='blue', label='Data')
plt.plot(x, y_pred, color='red', label=f'Regression Line\nIntercept: {model.intercept_:.2f}, Slope: {model.coef_[0]:.2f}')
plt.title('Linear Regression with MSE')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True)
plt.show()



import numpy as np
import scipy.stats as stats
from scipy.optimize import minimize

# データ生成
np.random.seed(0)
n = 100
true_mean = 50
true_std_dev = 10
data = np.random.normal(loc=true_mean, scale=true_std_dev, size=n)

# 正規分布の確率密度関数
def normal_pdf(x, mean, std_dev):
    return stats.norm.pdf(x, loc=mean, scale=std_dev)

# Average log-likelihood (negated, because scipy.optimize.minimize minimizes)
def average_log_likelihood(params, data):
    mean, std_dev = params
    if std_dev <= 0:
        return np.inf  # invalid parameters must look bad to the minimizer, so return +inf, not -inf
    log_likelihood = np.mean(np.log(normal_pdf(data, mean, std_dev)))
    return -log_likelihood  # negative average log-likelihood

# 最尤推定
initial_params = [np.mean(data), np.std(data)]
result = minimize(average_log_likelihood, initial_params, args=(data,), bounds=[(None, None), (1e-5, None)])
estimated_mean, estimated_std_dev = result.x

# 平均対数尤度の計算結果
log_likelihood = -result.fun
print(f"推定された平均 (Estimated Mean): {estimated_mean:.2f}")
print(f"推定された標準偏差 (Estimated Std Dev): {estimated_std_dev:.2f}")
print(f"平均対数尤度 (Average Log-Likelihood): {log_likelihood:.2f}")

# Relative entropy (KL divergence) estimated by Monte Carlo:
# since `data` was drawn from the true distribution, E_true[log(p_true / p_est)]
# can be approximated by the sample mean of the log density ratio.
def kl_divergence(params, data, true_mean, true_std_dev):
    estimated_mean, estimated_std_dev = params
    true_pdf = normal_pdf(data, true_mean, true_std_dev)
    estimated_pdf = normal_pdf(data, estimated_mean, estimated_std_dev)
    kl_div = np.mean(np.log(true_pdf / estimated_pdf))
    return kl_div

# 最小化のための最適化
result_kl = minimize(kl_divergence, initial_params, args=(data, true_mean, true_std_dev), bounds=[(None, None), (1e-5, None)])
estimated_mean_kl, estimated_std_dev_kl = result_kl.x

# 相対エントロピーの計算結果
kl_divergence_value = result_kl.fun
print(f"KLダイバージェンスの最小化結果:")
print(f"推定された平均 (Estimated Mean): {estimated_mean_kl:.2f}")
print(f"推定された標準偏差 (Estimated Std Dev): {estimated_std_dev_kl:.2f}")
print(f"KLダイバージェンス (KL Divergence): {kl_divergence_value:.2f}")

# データの可視化
import matplotlib.pyplot as plt

# データと推定された正規分布のPDF
x = np.linspace(min(data), max(data), 100)
true_pdf = normal_pdf(x, true_mean, true_std_dev)
estimated_pdf = normal_pdf(x, estimated_mean, estimated_std_dev)

plt.figure(figsize=(12, 6))
plt.hist(data, bins=20, alpha=0.6, color='blue', edgecolor='black', density=True, label='Sample Data')
plt.plot(x, true_pdf, color='green', linestyle='--', label=f'True Normal Distribution\nMean: {true_mean}, Std Dev: {true_std_dev}')
plt.plot(x, estimated_pdf, color='red', label=f'Estimated Normal Distribution\nMean: {estimated_mean:.2f}, Std Dev: {estimated_std_dev:.2f}')
plt.title('Histogram and Normal Distributions')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
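
As a check on the Monte Carlo estimate above, the KL divergence between two normal distributions has a closed form:

$$
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_0,\sigma_0^2)\,\|\,\mathcal{N}(\mu_1,\sigma_1^2)\big)
= \ln\frac{\sigma_1}{\sigma_0} + \frac{\sigma_0^2 + (\mu_0-\mu_1)^2}{2\sigma_1^2} - \frac{1}{2}.
$$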


import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.optimize import minimize

# データ生成
np.random.seed(0)
n = 100
true_mean = 50
true_std_dev = 10

# 正規分布に従うデータ
data = np.random.normal(loc=true_mean, scale=true_std_dev, size=n)

# Q-Qプロットの生成
def qq_plot(data, dist="norm"):
    stats.probplot(data, dist=dist, plot=plt)
    plt.title(f'Q-Q Plot ({dist.capitalize()})')
    plt.grid(True)
    plt.show()

qq_plot(data)

# ポアソン分布とガンマ分布の生成
lambda_poisson = 5
k_gamma = 2
theta_gamma = 2

poisson_data = np.random.poisson(lambda_poisson, size=n)
gamma_data = np.random.gamma(k_gamma, theta_gamma, size=n)

# 確率変数の期待値、平均値、標準偏差の計算
def calculate_statistics(data):
    mean = np.mean(data)
    std_dev = np.std(data, ddof=1)  # 不偏分散
    return mean, std_dev

mean_data, std_dev_data = calculate_statistics(data)
mean_poisson, std_dev_poisson = calculate_statistics(poisson_data)
mean_gamma, std_dev_gamma = calculate_statistics(gamma_data)

print(f'正規分布データの期待値 (Mean): {mean_data:.2f}, 標準偏差 (Std Dev): {std_dev_data:.2f}')
print(f'ポアソン分布データの期待値 (Mean): {mean_poisson:.2f}, 標準偏差 (Std Dev): {std_dev_poisson:.2f}')
print(f'ガンマ分布データの期待値 (Mean): {mean_gamma:.2f}, 標準偏差 (Std Dev): {std_dev_gamma:.2f}')

# ロジット関数
def logit(p):
    return np.log(p / (1 - p))

def inv_logit(x):
    return 1 / (1 + np.exp(-x))

# 2値判定のためのデータ生成
np.random.seed(0)
X = np.random.uniform(-10, 10, size=(n, 1))
y = (X.flatten() > 0).astype(int)  # Xが0より大きい場合に1、それ以外は0

# ロジスティック回帰モデルの訓練
model = LogisticRegression()
model.fit(X, y)

# 予測と精度の計算
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

print(f'ロジスティック回帰モデルの精度 (Accuracy): {accuracy:.2f}')

# データと決定境界のプロット
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, color='blue', label='Data')
x_values = np.linspace(-10, 10, 100).reshape(-1, 1)
predicted_proba = model.predict_proba(x_values)[:, 1]  # class-1 probability along the x grid
plt.plot(x_values, predicted_proba, color='red', label='Predicted Probability')
plt.axhline(0.5, color='gray', linestyle='--', label='Decision Threshold (p = 0.5)')
plt.title('Logistic Regression and Decision Boundary')
plt.xlabel('X')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)
plt.show()

# 確率分布の可視化
x = np.linspace(min(data.min(), poisson_data.min(), gamma_data.min()), 
                max(data.max(), poisson_data.max(), gamma_data.max()), 100)

plt.figure(figsize=(12, 8))

# 正規分布のヒストグラムとPDF
plt.subplot(3, 1, 1)
plt.hist(data, bins=20, alpha=0.6, color='blue', edgecolor='black', density=True, label='Normal Data')
plt.plot(x, stats.norm.pdf(x, loc=true_mean, scale=true_std_dev), color='red', label='Normal PDF')
plt.title('Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

# ポアソン分布のヒストグラムとPMF
plt.subplot(3, 1, 2)
plt.hist(poisson_data, bins=20, alpha=0.6, color='green', edgecolor='black', density=True, label='Poisson Data')
plt.plot(x, stats.poisson.pmf(np.floor(x).astype(int), mu=lambda_poisson), color='red', label='Poisson PMF')
plt.title('Poisson Distribution')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)

# ガンマ分布のヒストグラムとPDF
plt.subplot(3, 1, 3)
plt.hist(gamma_data, bins=20, alpha=0.6, color='purple', edgecolor='black', density=True, label='Gamma Data')
plt.plot(x, stats.gamma.pdf(x, a=k_gamma, scale=theta_gamma), color='red', label='Gamma PDF')
plt.title('Gamma Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# データ生成
np.random.seed(0)
n = 100
X = np.random.uniform(-10, 10, size=(n, 1))
y = (X.flatten() > 0).astype(int)  # Xが0より大きい場合に1、それ以外は0

# ロジスティック回帰モデルの訓練
model = LogisticRegression()
model.fit(X, y)

# 予測値と確率
y_pred_proba = model.predict_proba(X)[:, 1]  # クラス1の確率
y_pred = model.predict(X)

# 対数オッズ比の計算
def log_odds(p):
    return np.log(p / (1 - p))

log_odds_pred = log_odds(y_pred_proba)

# ピアソン残差の計算
def pearson_residuals(y, y_pred_proba):
    return (y - y_pred_proba) / np.sqrt(y_pred_proba * (1 - y_pred_proba))

pearson_residuals_values = pearson_residuals(y, y_pred_proba)

# Deviance残差の計算
def deviance_residuals(y, y_pred_proba):
    return np.sign(y - y_pred_proba) * np.sqrt(-2 * (y * np.log(y_pred_proba + 1e-10) + 
                                                    (1 - y) * np.log(1 - y_pred_proba + 1e-10)))

deviance_residuals_values = deviance_residuals(y, y_pred_proba)

# 交差エントロピー誤差の計算
cross_entropy_error = log_loss(y, y_pred_proba)

# 結果の表示
print(f'対数オッズ比 (Log Odds) の平均: {np.mean(log_odds_pred):.2f}')
print(f'ピアソン残差 (Pearson Residuals) の平均: {np.mean(pearson_residuals_values):.2f}')
print(f'Deviance残差 (Deviance Residuals) の平均: {np.mean(deviance_residuals_values):.2f}')
print(f'交差エントロピー誤差 (Cross-Entropy Error): {cross_entropy_error:.2f}')

# データと予測の可視化
plt.figure(figsize=(12, 8))

# Plot the raw data and the predicted probabilities (sorted by X so the curve is drawn cleanly)
plt.subplot(3, 1, 1)
order = np.argsort(X.flatten())
plt.scatter(X, y, alpha=0.6, color='blue', label='Data')
plt.plot(X[order], y_pred_proba[order], color='red', label='Predicted Probability')
plt.title('Data and Predicted Probabilities')
plt.xlabel('X')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)

# ピアソン残差のプロット
plt.subplot(3, 1, 2)
plt.scatter(X, pearson_residuals_values, alpha=0.6, color='green')
plt.title('Pearson Residuals')
plt.xlabel('X')
plt.ylabel('Pearson Residual')
plt.grid(True)

# Deviance残差のプロット
plt.subplot(3, 1, 3)
plt.scatter(X, deviance_residuals_values, alpha=0.6, color='purple')
plt.title('Deviance Residuals')
plt.xlabel('X')
plt.ylabel('Deviance Residual')
plt.grid(True)

plt.tight_layout()
plt.show()
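
For reference, the cross-entropy (log loss) reported above is

$$
\mathcal{L} = -\frac{1}{n}\sum_{i=1}^{n}\big[y_i \ln \hat{p}_i + (1-y_i)\ln(1-\hat{p}_i)\big],
$$

and the Pearson and deviance residuals are exactly the quantities computed in pearson_residuals and deviance_residuals above.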


import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import PoissonRegressor, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# データ生成
np.random.seed(0)
n = 200
X = np.random.uniform(-5, 5, size=(n, 1))
true_coef = 2
y = np.random.poisson(lam=np.exp(true_coef * X.flatten()), size=n)  # ポアソン分布に従うデータ

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ポアソン回帰モデルの訓練と予測
poisson_model = PoissonRegressor(alpha=0.0)  # 正則化項なし
poisson_model.fit(X_train, y_train)
y_pred_poisson = poisson_model.predict(X_test)

# リッジ回帰の訓練と予測
ridge_model = Ridge(alpha=1.0)  # リッジ回帰
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# ラッソ回帰の訓練と予測
lasso_model = Lasso(alpha=0.1)  # ラッソ回帰
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# モデルの評価
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return mse

mse_poisson = evaluate_model(y_test, y_pred_poisson)
mse_ridge = evaluate_model(y_test, y_pred_ridge)
mse_lasso = evaluate_model(y_test, y_pred_lasso)

print(f'ポアソン回帰の平均二乗誤差 (MSE): {mse_poisson:.2f}')
print(f'リッジ回帰の平均二乗誤差 (MSE): {mse_ridge:.2f}')
print(f'ラッソ回帰の平均二乗誤差 (MSE): {mse_lasso:.2f}')

# プロット
plt.figure(figsize=(12, 8))

# ポアソン回帰のプロット
plt.subplot(3, 1, 1)
plt.scatter(X_test, y_test, alpha=0.6, color='blue', label='Actual Data')
order = np.argsort(X_test.flatten())  # sort so the exponential prediction curve is drawn left to right
plt.plot(X_test[order], y_pred_poisson[order], color='red', label='Poisson Regression')
plt.title('Poisson Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True)

# リッジ回帰のプロット
plt.subplot(3, 1, 2)
plt.scatter(X_test, y_test, alpha=0.6, color='blue', label='Actual Data')
plt.plot(X_test, y_pred_ridge, color='green', label='Ridge Regression')
plt.title('Ridge Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True)

# ラッソ回帰のプロット
plt.subplot(3, 1, 3)
plt.scatter(X_test, y_test, alpha=0.6, color='blue', label='Actual Data')
plt.plot(X_test, y_pred_lasso, color='purple', label='Lasso Regression')
plt.title('Lasso Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()



import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.datasets import make_regression

# データ生成
np.random.seed(0)
X, y = make_regression(n_samples=100, n_features=10, noise=0.1, random_state=0)

# リッジ回帰の異なるアルファ値でモデルを訓練
alphas = [0.01, 0.1, 1, 10, 100]
coefficients = []

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X, y)
    coefficients.append(ridge_model.coef_)

# 係数のプロット
plt.figure(figsize=(12, 8))

for i, alpha in enumerate(alphas):
    plt.plot(coefficients[i], label=f'Alpha = {alpha}')

plt.title('Ridge Regression Coefficients for Different Alpha Values')
plt.xlabel('Feature Index')
plt.ylabel('Coefficient Value')
plt.legend()
plt.grid(True)
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler

# データ生成
np.random.seed(0)
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_clusters_per_class=1, n_redundant=0, flip_y=0.1, random_state=0)

# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# パーセプトロンモデルの訓練(L2正則化付き)
class SimplePerceptronWithL2:
    def __init__(self, alpha=0.01, max_iter=1000):
        self.alpha = alpha  # 学習率
        self.max_iter = max_iter  # 最大イテレーション数

    def fit(self, X, y):
        # 初期化
        self.weights = np.zeros(X.shape[1])
        self.bias = 0
        self.cost_history = []

        for _ in range(self.max_iter):
            for xi, target in zip(X, y):
                update = self.alpha * (target - self.predict(xi))
                self.weights += update * xi
                self.bias += update

            # L2正則化
            self.weights -= self.alpha * self.weights

            # 損失の計算
            cost = np.sum((self.predict(X) - y) ** 2) + self.alpha * np.sum(self.weights ** 2)
            self.cost_history.append(cost)

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output >= 0.0, 1, 0)

# モデルの訓練
perceptron = SimplePerceptronWithL2(alpha=0.01, max_iter=1000)
perceptron.fit(X_scaled, y)

# 予測
y_pred = perceptron.predict(X_scaled)

# 正則化の影響の可視化
plt.figure(figsize=(12, 6))

# データのプロット
plt.subplot(1, 2, 1)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap='bwr', edgecolor='k', s=50)
plt.title('Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# 決定境界のプロット
plt.subplot(1, 2, 2)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap='bwr', edgecolor='k', s=50, alpha=0.7)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 100), np.linspace(ylim[0], ylim[1], 100))
Z = perceptron.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3, cmap='bwr', levels=[-1, 0, 1])
plt.title('Decision Boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

plt.tight_layout()
plt.show()

# 結果の表示
print(f'Weights: {perceptron.weights}')
print(f'Bias: {perceptron.bias}')
print(f'Final Cost: {perceptron.cost_history[-1]}')




import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# データ生成
np.random.seed(0)
X, y = make_regression(n_samples=200, n_features=1, noise=0.1, random_state=0)

# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = y  # 回帰の場合、標準化する必要はありませんが、ここではそのまま使用します

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=0)

# ニューラルネットワークによる回帰モデルの訓練
nn_regressor = MLPRegressor(hidden_layer_sizes=(10,), activation='relu', max_iter=1000, random_state=0)
nn_regressor.fit(X_train, y_train)

# ロジスティック回帰モデルの訓練(バイナリデータのためにデータを変更)
y_binary = (y > np.median(y)).astype(int)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X_scaled, y_binary, test_size=0.3, random_state=0)
logistic_model = LogisticRegression()
logistic_model.fit(X_train_bin, y_train_bin)

# 予測
y_pred_nn = nn_regressor.predict(X_test)
y_pred_logistic = logistic_model.predict(X_test_bin)

# 結果の可視化
plt.figure(figsize=(12, 6))

# ニューラルネットワークによる回帰のプロット
plt.subplot(1, 2, 1)
plt.scatter(X_scaled, y, color='blue', label='Data')
plt.plot(X_test, y_pred_nn, color='red', label='NN Regressor')
plt.title('Neural Network Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.grid(True)

# ロジスティック回帰のプロット
plt.subplot(1, 2, 2)
plt.scatter(X_scaled, y_binary, color='blue', label='Data')
order = np.argsort(X_test_bin.flatten())  # sort so the sigmoid curve is drawn left to right
plt.plot(X_test_bin[order], logistic_model.predict_proba(X_test_bin[order])[:, 1], color='red', label='Logistic Regression')
plt.title('Logistic Regression')
plt.xlabel('Feature')
plt.ylabel('Probability')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# 結果の表示
print(f'ニューラルネットワークの訓練スコア: {nn_regressor.score(X_train, y_train):.2f}')
print(f'ニューラルネットワークのテストスコア: {nn_regressor.score(X_test, y_test):.2f}')
print(f'ロジスティック回帰の訓練スコア: {logistic_model.score(X_train_bin, y_train_bin):.2f}')
print(f'ロジスティック回帰のテストスコア: {logistic_model.score(X_test_bin, y_test_bin):.2f}')


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# データ生成
np.random.seed(0)
X, y = make_classification(n_samples=200, n_features=2, n_informative=2, n_clusters_per_class=1, n_redundant=0, flip_y=0.1, random_state=0)

# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)

# ロジスティック回帰モデルの訓練
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# 予測
y_pred_prob = logistic_model.predict_proba(X_test)[:, 1]  # ポジティブクラスの確率
y_pred = logistic_model.predict(X_test)  # クラス予測

# 対数オッズ比(ロジット)の計算
logit = np.log(y_pred_prob / (1 - y_pred_prob))

# プロット
plt.figure(figsize=(12, 6))

# 決定境界のプロット
plt.subplot(1, 2, 1)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap='bwr', edgecolor='k', s=50, alpha=0.7)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 100), np.linspace(ylim[0], ylim[1], 100))
Z = logistic_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3, cmap='bwr', levels=[0, 0.5, 1])
plt.title('Decision Boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# 対数オッズ比のプロット
plt.subplot(1, 2, 2)
plt.scatter(y_pred_prob, logit, c=y_test, cmap='bwr', edgecolor='k', s=50, alpha=0.7)
plt.title('Logit vs Predicted Probability')
plt.xlabel('Predicted Probability')
plt.ylabel('Logit')
plt.grid(True)

plt.tight_layout()
plt.show()

# 結果の表示
print(f'訓練データスコア: {logistic_model.score(X_train, y_train):.2f}')
print(f'テストデータスコア: {logistic_model.score(X_test, y_test):.2f}')
print(f'ロジスティック回帰係数: {logistic_model.coef_}')
print(f'ロジスティック回帰切片: {logistic_model.intercept_}')
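
For logistic regression the log-odds (logit) is exactly the linear function coef·x + intercept, so the logit computed from predict_proba above can be verified numerically against the fitted coefficients. A short sketch with the same setup; picking the first test sample is an arbitrary choice.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=2, n_informative=2, n_clusters_per_class=1, n_redundant=0, flip_y=0.1, random_state=0)
X_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)
model = LogisticRegression().fit(X_train, y_train)

# Logit computed from the predicted probability vs. the linear form coef . x + intercept
p = model.predict_proba(X_test[:1])[0, 1]
logit_from_prob = np.log(p / (1 - p))
logit_from_coef = X_test[0] @ model.coef_[0] + model.intercept_[0]
print(f"logit from probability: {logit_from_prob:.4f}")
print(f"coef . x + intercept:   {logit_from_coef:.4f}")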


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# データ生成
np.random.seed(0)
X, y = make_regression(n_samples=100, n_features=2, noise=0.1, random_state=0)

# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=0)

# リッジ回帰のモデル訓練
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# ラッソ回帰のモデル訓練
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# 予測
y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

# プロット
plt.figure(figsize=(14, 7))

# リッジ回帰のプロット
plt.subplot(1, 2, 1)
plt.scatter(X_scaled[:, 0], y, color='blue', label='Data')
plt.scatter(X_test[:, 0], y_pred_ridge, color='red', label='Ridge Prediction')  # predictions depend on both features, so plot them as points
plt.title('Ridge Regression')
plt.xlabel('Feature 1')
plt.ylabel('Target')
plt.legend()
plt.grid(True)

# ラッソ回帰のプロット
plt.subplot(1, 2, 2)
plt.scatter(X_scaled[:, 0], y, color='blue', label='Data')
plt.scatter(X_test[:, 0], y_pred_lasso, color='red', label='Lasso Prediction')  # predictions depend on both features, so plot them as points
plt.title('Lasso Regression')
plt.xlabel('Feature 1')
plt.ylabel('Target')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# 結果の表示
print(f'Ridge回帰の係数: {ridge_model.coef_}')
print(f'Lasso回帰の係数: {lasso_model.coef_}')
print(f'Ridge回帰の訓練スコア: {ridge_model.score(X_train, y_train):.2f}')
print(f'Lasso回帰の訓練スコア: {lasso_model.score(X_train, y_train):.2f}')
print(f'Ridge回帰のテストスコア: {ridge_model.score(X_test, y_test):.2f}')
print(f'Lasso回帰のテストスコア: {lasso_model.score(X_test, y_test):.2f}')
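
The fits above use a single regularization strength each. Where Ridge and Lasso actually differ is in how the coefficients shrink as alpha grows, so a coefficient-path plot is a useful complement. The sketch below reuses the same synthetic-data setup; the logarithmic alpha grid is an arbitrary choice.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=2, noise=0.1, random_state=0)
X = StandardScaler().fit_transform(X)

alphas = np.logspace(-2, 3, 50)  # arbitrary grid of regularization strengths
ridge_coefs = np.array([Ridge(alpha=a).fit(X, y).coef_ for a in alphas])
lasso_coefs = np.array([Lasso(alpha=a, max_iter=10000).fit(X, y).coef_ for a in alphas])

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(alphas, ridge_coefs)
plt.xscale('log')
plt.title('Ridge Coefficient Paths')
plt.xlabel('alpha')
plt.ylabel('Coefficient')
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(alphas, lasso_coefs)
plt.xscale('log')
plt.title('Lasso Coefficient Paths')
plt.xlabel('alpha')
plt.ylabel('Coefficient')
plt.grid(True)

plt.tight_layout()
plt.show()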


import numpy as np
import scipy.stats as stats

# 母集団のパラメータ(仮定)
population_mean = 50  # 母平均
population_std_dev = 10  # 母標準偏差

# 標本データの生成
np.random.seed(0)  # 再現性のためのシード設定
sample_size = 30  # 標本の大きさ
sample = np.random.normal(loc=population_mean, scale=population_std_dev, size=sample_size)

# 標本平均と標本標準偏差の計算
sample_mean = np.mean(sample)
sample_std_dev = np.std(sample, ddof=1)  # ddof=1 で標本標準偏差を計算

# 信頼度と信頼区間の計算
confidence_level = 0.95  # 95% 信頼区間
z_score = stats.norm.ppf((1 + confidence_level) / 2)  # Zスコアの取得

# 標本平均の標準誤差の計算
standard_error = sample_std_dev / np.sqrt(sample_size)

# 信頼区間の計算
margin_of_error = z_score * standard_error
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

# 結果の表示
print(f"母平均: {population_mean}")
print(f"母標準偏差: {population_std_dev}")
print(f"標本の大きさ: {sample_size}")
print(f"標本平均: {sample_mean:.2f}")
print(f"標本標準偏差: {sample_std_dev:.2f}")
print(f"信頼度: {confidence_level * 100}%")
print(f"信頼区間: {confidence_interval[0]:.2f} から {confidence_interval[1]:.2f}")



import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom, norm

# 二項分布のパラメータ
n = 20  # 試行回数
p = 0.5  # 成功の確率

# 二項分布の平均、分散、標準偏差
binom_mean = n * p
binom_variance = n * p * (1 - p)
binom_std_dev = np.sqrt(binom_variance)

# 二項分布の確率質量関数 (PMF)
x = np.arange(0, n + 1)
binom_pmf = binom.pmf(x, n, p)

# 正規分布のパラメータ
norm_mean = binom_mean
norm_std_dev = binom_std_dev

# 正規分布の確率密度関数 (PDF)
x_norm = np.linspace(norm_mean - 4 * norm_std_dev, norm_mean + 4 * norm_std_dev, 1000)
norm_pdf = norm.pdf(x_norm, norm_mean, norm_std_dev)

# プロット
plt.figure(figsize=(14, 6))

# 二項分布のプロット
plt.subplot(1, 2, 1)
plt.bar(x, binom_pmf, color='skyblue', edgecolor='black')
plt.title('Binomial Distribution PMF')
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.grid(axis='y')

# 正規分布のプロット
plt.subplot(1, 2, 2)
plt.plot(x_norm, norm_pdf, color='darkblue')
plt.title('Normal Distribution PDF')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.grid(True)

plt.tight_layout()
plt.show()

# 結果の表示
print(f"二項分布の平均: {binom_mean:.2f}")
print(f"二項分布の分散: {binom_variance:.2f}")
print(f"二項分布の標準偏差: {binom_std_dev:.2f}")
print(f"正規分布の平均: {norm_mean:.2f}")
print(f"正規分布の標準偏差: {norm_std_dev:.2f}")


import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

# 2次元正規分布のパラメータ
mean = [0, 0]  # 平均ベクトル
covariance = [[1, 0.8], [0.8, 1]]  # 共分散行列

# 同時分布の確率密度関数 (PDF)
x, y = np.mgrid[-3:3:.01, -3:3:.01]
pos = np.dstack((x, y))
rv = multivariate_normal(mean, covariance)
pdf = rv.pdf(pos)

# 周辺分布の計算(X軸方向とY軸方向のマージナル)
# Each marginal is obtained by numerically integrating the joint pdf over the other variable
x_marginal = x[:, 0]
y_marginal = y[0, :]
x_pdf = np.trapz(pdf, y_marginal, axis=1)  # marginal of X: integrate out Y
y_pdf = np.trapz(pdf, x_marginal, axis=0)  # marginal of Y: integrate out X

# 期待値の計算
expected_value = np.mean(mean)

# プロット
plt.figure(figsize=(18, 6))

# 同時分布のプロット
plt.subplot(1, 3, 1)
plt.contourf(x, y, pdf, cmap='viridis', levels=50)
plt.title('Joint Distribution')
plt.xlabel('X')
plt.ylabel('Y')
plt.colorbar(label='Probability Density')

# 周辺分布 Xのプロット
plt.subplot(1, 3, 2)
plt.plot(x_marginal, x_pdf, color='blue')
plt.title('Marginal Distribution (X)')
plt.xlabel('X')
plt.ylabel('Probability Density')

# 周辺分布 Yのプロット
plt.subplot(1, 3, 3)
plt.plot(y_marginal, y_pdf, color='green')
plt.title('Marginal Distribution (Y)')
plt.xlabel('Y')
plt.ylabel('Probability Density')

plt.tight_layout()
plt.show()

# 結果の表示
print(f"期待値 (X, Y): {mean}")
print(f"期待値のスカラー値: {expected_value:.2f}")




import numpy as np
import matplotlib.pyplot as plt

# Define the discrete probability distribution (fair die)
# Values of the random variable and their probabilities
values = np.arange(1, 7)  # Die faces: 1, 2, 3, 4, 5, 6
probabilities = np.ones(6) / 6  # Equal probability for each face (fair die)

# Calculate expected value, variance, and standard deviation
expected_value = np.sum(values * probabilities)
variance = np.sum((values - expected_value)**2 * probabilities)
standard_deviation = np.sqrt(variance)

# Create the probability distribution table
distribution_table = np.vstack((values, probabilities)).T
print("Probability Distribution Table:")
print("Value\tProbability")
print(distribution_table)

# Plotting
plt.figure(figsize=(12, 6))

# Probability distribution plot
plt.subplot(1, 2, 1)
plt.bar(values, probabilities, color='skyblue', edgecolor='black')
plt.title('Probability Distribution')
plt.xlabel('Value')
plt.ylabel('Probability')

# Plot for expected value and standard deviation
plt.subplot(1, 2, 2)
plt.bar(values, probabilities, color='skyblue', edgecolor='black', alpha=0.6)
plt.axvline(expected_value, color='red', linestyle='--', label=f'Expected Value = {expected_value:.2f}')
plt.axvline(expected_value + standard_deviation, color='green', linestyle='--', label=f'Expected Value + Std Dev = {expected_value + standard_deviation:.2f}')
plt.axvline(expected_value - standard_deviation, color='green', linestyle='--', label=f'Expected Value - Std Dev = {expected_value - standard_deviation:.2f}')
plt.title('Expected Value and Standard Deviation')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.legend()

plt.tight_layout()
plt.show()

# Display results
print(f"Expected Value: {expected_value:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Standard Deviation: {standard_deviation:.2f}")


import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# データの生成
np.random.seed(42)
data = np.random.normal(loc=5, scale=2, size=100)  # 平均5、標準偏差2の正規分布からサンプルデータを生成

# 推定量と推定値
sample_mean = np.mean(data)
sample_variance = np.var(data, ddof=1)  # 不偏分散の計算
print("Sample Mean:", sample_mean)
print("Unbiased Sample Variance:", sample_variance)

# 雑音正規分布
noise_mean = 0
noise_std = 1
noise = np.random.normal(noise_mean, noise_std, size=100)
noisy_data = data + noise

# 最小二乗推定法
# 例として、単純な線形回帰モデルの最小二乗法を使用
X = np.vstack((np.ones(len(noisy_data)), np.arange(len(noisy_data)))).T
y = noisy_data
theta_ls = np.linalg.lstsq(X, y, rcond=None)[0]
print("\nLeast Squares Estimates:")
print("Intercept:", theta_ls[0])
print("Slope:", theta_ls[1])

# 最尤推定 (正規分布パラメータの推定)
mle_mean = np.mean(data)
mle_variance = np.var(data, ddof=0)  # 標本分散を計算
print("\nMaximum Likelihood Estimates:")
print("Mean:", mle_mean)
print("Variance:", mle_variance)

# ベイズ推定 (正規分布の事前分布と事後分布を使用)
# ここでは、事前分布として正規分布を仮定し、事後分布を計算
prior_mean = 5
prior_variance = 1
likelihood_variance = np.var(noisy_data, ddof=0)

posterior_variance = 1 / (1/prior_variance + len(noisy_data)/likelihood_variance)
posterior_mean = posterior_variance * (prior_mean/prior_variance + np.sum(noisy_data)/likelihood_variance)
print("\nBayesian Estimates:")
print("Posterior Mean:", posterior_mean)
print("Posterior Variance:", posterior_variance)

# データと推定値のプロット
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.hist(data, bins=20, alpha=0.7, color='blue', edgecolor='black')
plt.title('Histogram of Data')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.subplot(2, 2, 2)
plt.hist(noisy_data, bins=20, alpha=0.7, color='red', edgecolor='black')
plt.title('Histogram of Noisy Data')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.subplot(2, 2, 3)
plt.scatter(np.arange(len(noisy_data)), noisy_data, label='Noisy Data')
plt.plot(np.arange(len(noisy_data)), X @ theta_ls, color='green', label='Least Squares Fit')
plt.title('Least Squares Fit')
plt.xlabel('Index')
plt.ylabel('Value')
plt.legend()

plt.subplot(2, 2, 4)
x = np.linspace(np.min(data) - 1, np.max(data) + 1, 100)
pdf = stats.norm.pdf(x, loc=mle_mean, scale=np.sqrt(mle_variance))
plt.plot(x, pdf, color='blue', label='MLE Fit')
plt.hist(data, bins=20, density=True, alpha=0.5, color='red', edgecolor='black', label='Data Histogram')
plt.title('Maximum Likelihood Estimate Fit')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()

plt.tight_layout()
plt.show()

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# 1. 観測データの生成
np.random.seed(42)
data_size = 100
X = np.random.normal(loc=0, scale=1, size=(data_size, 2))  # 2次元データ
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # クラスラベル(1 if x1 + x2 > 0 else 0)

# 2. 仮説検定
# 帰無仮説: Xの平均が0であるという仮説
mean, var = np.mean(X, axis=0), np.var(X, axis=0)
z_scores = mean / np.sqrt(var / data_size)
p_values = 2 * (1 - stats.norm.cdf(np.abs(z_scores)))
print("\nHypothesis Testing:")
print("Z-scores:", z_scores)
print("P-values:", p_values)

# 3. 標準正規分布のプロット
x = np.linspace(-4, 4, 100)
pdf = stats.norm.pdf(x)
plt.figure(figsize=(12, 6))
plt.plot(x, pdf, label='Standard Normal Distribution')
plt.title('Standard Normal Distribution')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.grid(True)
plt.legend()
plt.show()

# 4. ニューラルネットワークの学習と予測
# データの標準化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# MLPニューラルネットワークの訓練
clf = MLPClassifier(hidden_layer_sizes=(5,), max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

# 予測と精度評価
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nNeural Network Accuracy:", accuracy)

# 5. ベイズ学習とベイズ予測
# ベイズ推定の簡単な例として、Bernoulli分布を使用
from scipy.stats import bernoulli

# ベイズ予測(ベータ分布の事前を使用)
alpha, beta = 2, 2  # ベータ分布のパラメータ
data = np.random.binomial(1, p=0.6, size=100)  # サンプルデータ
posterior_alpha = alpha + np.sum(data)
posterior_beta = beta + len(data) - np.sum(data)
x = np.linspace(0, 1, 100)
pdf_beta = stats.beta.pdf(x, posterior_alpha, posterior_beta)

plt.figure(figsize=(12, 6))
plt.plot(x, pdf_beta, label='Posterior Beta Distribution')
plt.title('Posterior Distribution after Bayesian Learning')
plt.xlabel('Probability')
plt.ylabel('Density')
plt.grid(True)
plt.legend()
plt.show()

# ベイズ予測の例として、データの平均と標準偏差を使用
mean_post = posterior_alpha / (posterior_alpha + posterior_beta)
std_post = np.sqrt(posterior_alpha * posterior_beta / ((posterior_alpha + posterior_beta)**2 * (posterior_alpha + posterior_beta + 1)))
print("\nBayesian Prediction:")
print("Posterior Mean:", mean_post)
print("Posterior Std Dev:", std_post)

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from hmmlearn import hmm

# 1. 確率過程と予測
np.random.seed(42)
n_samples = 100
t = np.arange(n_samples)
true_mean = 5
true_std = 2
process = true_mean + true_std * np.random.randn(n_samples)

# 2. 尤度関数と最尤推定
# 最尤推定(正規分布のパラメータ推定)
def likelihood_function(mu, sigma, data):
    return np.prod(stats.norm.pdf(data, mu, sigma))

# パラメータ推定
mu_mle = np.mean(process)
sigma_mle = np.std(process, ddof=0)
print("\nMLE Parameters:")
print("Mean:", mu_mle)
print("Standard Deviation:", sigma_mle)

# 3. 相関係数と共分散
correlation_matrix = np.corrcoef(process[:-1], process[1:])
covariance_matrix = np.cov(process[:-1], process[1:])
print("\nCorrelation Coefficient Matrix:")
print(correlation_matrix)
print("\nCovariance Matrix:")
print(covariance_matrix)

# 4. 不偏分散と不偏共分散
unbiased_variance = np.var(process, ddof=1)
print("\nUnbiased Variance:", unbiased_variance)

# 5. 自己共分散と自己相関
process_centered = process - np.mean(process)  # autocovariance is defined on the mean-centered series
auto_covariance = np.correlate(process_centered, process_centered, mode='full') / n_samples
auto_correlation = auto_covariance / auto_covariance[n_samples - 1]
lags = np.arange(-n_samples + 1, n_samples)
print("\nAutocovariance (first 5 values):", auto_covariance[:5])
print("Autocorrelation (first 5 values):", auto_correlation[:5])

# 6. 定常過程と雑音
stationary_process = np.cumsum(np.random.randn(n_samples))
noise = np.random.normal(0, 1, size=n_samples)

# 7. ベイズ予測
# ここでは、単純なベータ分布を使用してベイズ予測を行います
alpha, beta = 2, 2
data = np.random.binomial(1, p=0.6, size=100)
posterior_alpha = alpha + np.sum(data)
posterior_beta = beta + len(data) - np.sum(data)
x = np.linspace(0, 1, 100)
pdf_beta = stats.beta.pdf(x, posterior_alpha, posterior_beta)

plt.figure(figsize=(12, 6))
plt.plot(x, pdf_beta, label='Posterior Beta Distribution')
plt.title('Posterior Distribution after Bayesian Learning')
plt.xlabel('Probability')
plt.ylabel('Density')
plt.grid(True)
plt.legend()
plt.show()

# 8. マルコフ過程と逐次ベイズ予測
# 隠れマルコフモデルの例
model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=1000)
model.fit(process.reshape(-1, 1))

# 逐次ベイズ予測
predicted_states = model.predict(process.reshape(-1, 1))
print("\nHidden Markov Model States (first 10 predictions):", predicted_states[:10])

# 隠れマルコフ過程の生成とプロット
hidden_states = np.zeros(n_samples)
observations = np.zeros(n_samples)
for i in range(1, n_samples):
    state = int(np.random.choice([0, 1], p=model.transmat_[int(hidden_states[i-1])]))
    hidden_states[i] = state
    # covars_ is exposed as a full covariance matrix, so extract the scalar variance explicitly
    observations[i] = np.random.normal(model.means_[state, 0], np.sqrt(model.covars_[state].flatten()[0]))

plt.figure(figsize=(12, 6))
plt.plot(t, observations, label='Generated Observations')
plt.title('Hidden Markov Process Observations')
plt.xlabel('Time')
plt.ylabel('Value')
plt.grid(True)
plt.legend()
plt.show()
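
As a cross-check of the hand-rolled autocovariance above, statsmodels provides plot_acf, which draws the sample autocorrelation with confidence bands. A sketch with the same process settings (statsmodels is assumed to be installed):

import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

np.random.seed(42)
process = 5 + 2 * np.random.randn(100)  # same i.i.d. normal process as above
plot_acf(process, lags=20)
plt.show()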


import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Generate a random dataset
np.random.seed(42)
data = np.random.normal(loc=50, scale=10, size=1000)

# Calculate descriptive statistics
mean = np.mean(data)
median = np.median(data)
mode = np.atleast_1d(stats.mode(data).mode)[0]  # works across SciPy versions; the mode of continuous data is mostly illustrative
variance = np.var(data)
std_deviation = np.std(data)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {std_deviation}")

# Plotting the data
plt.hist(data, bins=30, edgecolor='k', alpha=0.7)
plt.axvline(mean, color='r', linestyle='dashed', linewidth=1, label=f'Mean: {mean:.2f}')
plt.axvline(median, color='g', linestyle='dashed', linewidth=1, label=f'Median: {median:.2f}')
plt.axvline(mode, color='b', linestyle='dashed', linewidth=1, label=f'Mode: {mode:.2f}')
plt.legend()
plt.title('Histogram of Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

# Central Limit Theorem demonstration
sample_means = []
sample_size = 30
num_samples = 1000

for _ in range(num_samples):
    sample = np.random.choice(data, size=sample_size, replace=True)
    sample_means.append(np.mean(sample))

# Plotting the sampling distribution of the sample mean
plt.hist(sample_means, bins=30, edgecolor='k', alpha=0.7, density=True)
mu = np.mean(sample_means)
sigma = np.std(sample_means)
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma), 'r', linewidth=2, label='Normal Distribution')
plt.legend()
plt.title('Sampling Distribution of the Sample Mean')
plt.xlabel('Sample Mean')
plt.ylabel('Density')
plt.show()

# Standard Error calculation
standard_error = std_deviation / np.sqrt(sample_size)
print(f"Standard Error: {standard_error}")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Generate synthetic dataset
np.random.seed(42)
data = np.random.randn(1000, 3) * [10, 20, 30] + [50, 100, 150]
df = pd.DataFrame(data, columns=['Feature1', 'Feature2', 'Feature3'])

# Add some outliers
df.iloc[0] = [500, -500, 300]
df.iloc[1] = [-500, 500, -300]

# Add some missing values
df.iloc[2:5, 0] = np.nan

# 2.1 Histograms, Scatter Plots, and Box Plots
plt.figure(figsize=(14, 6))

plt.subplot(1, 3, 1)
plt.hist(df['Feature1'].dropna(), bins=30, edgecolor='k', alpha=0.7)
plt.title('Histogram of Feature1')

plt.subplot(1, 3, 2)
plt.scatter(df['Feature1'], df['Feature2'], alpha=0.7)
plt.title('Scatter Plot between Feature1 and Feature2')

plt.subplot(1, 3, 3)
sns.boxplot(data=df[['Feature1', 'Feature2', 'Feature3']], orient='h')
plt.title('Box Plot of Features')

plt.tight_layout()
plt.show()

# 2.2 Outlier Detection and Handling
# Using IQR method for outlier detection
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
outliers = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)
df_no_outliers = df[~outliers]

# 2.3 Normalization and Standardization
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()

df_minmax_scaled = scaler_minmax.fit_transform(df.fillna(df.mean()))
df_standard_scaled = scaler_standard.fit_transform(df.fillna(df.mean()))

# 2.4 Handling Missing Values
# Filling missing values with mean
df_filled = df.fillna(df.mean())

# Plotting the results
plt.figure(figsize=(14, 6))

plt.subplot(1, 3, 1)
plt.hist(df_filled['Feature1'], bins=30, edgecolor='k', alpha=0.7)
plt.title('Histogram of Feature1 after Filling Missing Values')

plt.subplot(1, 3, 2)
sns.boxplot(data=pd.DataFrame(df_minmax_scaled, columns=['Feature1', 'Feature2', 'Feature3']), orient='h')
plt.title('Box Plot of Normalized Features')

plt.subplot(1, 3, 3)
sns.boxplot(data=pd.DataFrame(df_standard_scaled, columns=['Feature1', 'Feature2', 'Feature3']), orient='h')
plt.title('Box Plot of Standardized Features')

plt.tight_layout()
plt.show()


import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Generate synthetic dataset
np.random.seed(42)
data = np.random.randn(100, 2)
df = pd.DataFrame(data, columns=['Variable1', 'Variable2'])

# 3.1 Point Estimation and Interval Estimation
mean_estimate = df['Variable1'].mean()
std_dev = df['Variable1'].std()
confidence_interval = stats.norm.interval(0.95, loc=mean_estimate, scale=std_dev/np.sqrt(len(df)))

print("Point Estimate (Mean):", mean_estimate)
print("95% Confidence Interval:", confidence_interval)

# 3.2 Hypothesis Testing
# Null Hypothesis: The mean of Variable1 is 0
t_statistic, p_value = stats.ttest_1samp(df['Variable1'], 0)
print("t-Statistic:", t_statistic)
print("p-Value:", p_value)

# 3.3 t-Test, Chi-Square Test, F-Test
# Independent t-Test
group1 = df['Variable1'][:50]
group2 = df['Variable1'][50:]
t_statistic_ind, p_value_ind = stats.ttest_ind(group1, group2)
print("Independent t-Test t-Statistic:", t_statistic_ind)
print("Independent t-Test p-Value:", p_value_ind)

# Chi-Square Test
observed = pd.Series([20, 30, 50])
expected = pd.Series([25, 25, 50])
chi2_statistic, chi2_p_value = stats.chisquare(f_obs=observed, f_exp=expected)
print("Chi-Square Statistic:", chi2_statistic)
print("Chi-Square p-Value:", chi2_p_value)

# F-Test (ANOVA)
anova_statistic, anova_p_value = stats.f_oneway(group1, group2)
print("F-Test (ANOVA) Statistic:", anova_statistic)
print("F-Test (ANOVA) p-Value:", anova_p_value)

# 3.4 Correlation and Causation
# Pearson's correlation coefficient
pearson_corr, _ = stats.pearsonr(df['Variable1'], df['Variable2'])
print("Pearson's Correlation Coefficient:", pearson_corr)

# Spearman's rank correlation coefficient
spearman_corr, _ = stats.spearmanr(df['Variable1'], df['Variable2'])
print("Spearman's Rank Correlation Coefficient:", spearman_corr)

# Plotting the results
plt.figure(figsize=(14, 6))

# Histogram of Variable1
plt.subplot(1, 3, 1)
plt.hist(df['Variable1'], bins=30, edgecolor='k', alpha=0.7)
plt.title('Histogram of Variable1')

# Scatter plot between Variable1 and Variable2
plt.subplot(1, 3, 2)
plt.scatter(df['Variable1'], df['Variable2'], alpha=0.7)
plt.title('Scatter Plot between Variable1 and Variable2')

# Box plot of Variable1 grouped by a binary group
plt.subplot(1, 3, 3)
sns.boxplot(data=[group1, group2])
plt.title('Box Plot of Variable1')

plt.tight_layout()
plt.show()
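
The chi-square example above is a goodness-of-fit test against fixed expected counts; the test of independence on a contingency table is the other common use. A minimal sketch with made-up counts:

import numpy as np
import scipy.stats as stats

# 2x2 contingency table of observed counts (illustrative values only)
table = np.array([[20, 30],
                  [25, 25]])
chi2_stat, p_value, dof, expected = stats.chi2_contingency(table)
print("Chi-Square Statistic:", chi2_stat)
print("p-Value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Counts:\n", expected)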

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Generate synthetic dataset
np.random.seed(42)
X = np.random.rand(100, 2)
y = 3 * X[:, 0] + 2 * X[:, 1] + np.random.randn(100)
df = pd.DataFrame(X, columns=['Feature1', 'Feature2'])
df['Target'] = y

# 4.1 Simple Linear Regression
X_simple = df[['Feature1']]
y_simple = df['Target']
model_simple = LinearRegression().fit(X_simple, y_simple)
y_pred_simple = model_simple.predict(X_simple)
r2_simple = r2_score(y_simple, y_pred_simple)

print("Simple Linear Regression Coefficients:", model_simple.coef_)
print("Simple Linear Regression Intercept:", model_simple.intercept_)
print("Simple Linear Regression R²:", r2_simple)

# 4.2 Multiple Linear Regression
X_multiple = df[['Feature1', 'Feature2']]
model_multiple = LinearRegression().fit(X_multiple, df['Target'])
y_pred_multiple = model_multiple.predict(X_multiple)
r2_multiple = r2_score(df['Target'], y_pred_multiple)

print("Multiple Linear Regression Coefficients:", model_multiple.coef_)
print("Multiple Linear Regression Intercept:", model_multiple.intercept_)
print("Multiple Linear Regression R²:", r2_multiple)

# Check for multicollinearity using Variance Inflation Factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_with_constant = sm.add_constant(X_multiple)
vif = pd.Series([variance_inflation_factor(X_with_constant.values, i) for i in range(X_with_constant.shape[1])],
                index=X_with_constant.columns)
print("VIF:", vif)

# 4.3 Logistic Regression
# Generate binary target variable
y_binary = (df['Target'] > df['Target'].median()).astype(int)
logistic_model = LogisticRegression().fit(X_multiple, y_binary)
odds_ratios = np.exp(logistic_model.coef_)

print("Logistic Regression Coefficients:", logistic_model.coef_)
print("Logistic Regression Intercept:", logistic_model.intercept_)
print("Odds Ratios:", odds_ratios)

# Plotting the results
plt.figure(figsize=(18, 6))

# Simple Linear Regression plot
plt.subplot(1, 3, 1)
plt.scatter(X_simple, y_simple, alpha=0.7)
plt.plot(X_simple, y_pred_simple, color='red')
plt.title('Simple Linear Regression\nR²: {:.2f}'.format(r2_simple))
plt.xlabel('Feature1')
plt.ylabel('Target')

# Multiple Linear Regression plot
plt.subplot(1, 3, 2)
sns.scatterplot(x='Feature1', y='Target', hue='Feature2', data=df, palette='coolwarm')
plt.scatter(df['Feature1'], y_pred_multiple, color='red', s=15)  # predictions depend on both features, so show them as points rather than a line
plt.title('Multiple Linear Regression\nR²: {:.2f}'.format(r2_multiple))
plt.xlabel('Feature1')
plt.ylabel('Target')

# Logistic Regression plot
plt.subplot(1, 3, 3)
sns.scatterplot(x='Feature1', y='Feature2', hue=y_binary, palette='coolwarm', data=df)
plt.title('Logistic Regression')
plt.xlabel('Feature1')
plt.ylabel('Feature2')

plt.tight_layout()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
import statsmodels.api as sm

# データ生成
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# モデルのトレーニング
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 精度の評価
train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

# 結果のプロット
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.bar(['Training Accuracy', 'Test Accuracy'], [train_acc, test_acc], color=['blue', 'orange'])
plt.title('Training vs Test Accuracy')
plt.ylabel('Accuracy')

# クロスバリデーション
cv_scores = cross_val_score(model, X, y, cv=5)
plt.subplot(2, 2, 2)
plt.plot(cv_scores, marker='o', linestyle='--')
plt.title('Cross-Validation Scores')
plt.xlabel('Fold')
plt.ylabel('Score')

# AIC、BICの計算
def calculate_aic_bic(X, y):
    # Refit with statsmodels Logit (including an intercept) to obtain AIC and BIC
    results = sm.Logit(y, sm.add_constant(X)).fit(disp=0)
    return results.aic, results.bic

aic, bic = calculate_aic_bic(X_train, y_train)

plt.subplot(2, 2, 3)
plt.bar(['AIC', 'BIC'], [aic, bic], color=['green', 'red'])
plt.title('AIC vs BIC')
plt.ylabel('Score')

# ROC曲線とAUCの計算
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

plt.subplot(2, 2, 4)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')

plt.tight_layout()
plt.show()

# バイアス・バリアンスのトレードオフ
train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000),
    X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10)
)

train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

plt.figure()
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
plt.title('Learning Curve (Logistic Regression)')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.legend(loc='best')
plt.grid()
plt.show()
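
Accuracy, AIC/BIC, the ROC curve, and the learning curve above summarize the classifier globally; a confusion matrix and per-class report show where the errors fall. A sketch that refits the same kind of model on the same synthetic-data settings:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))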


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymc3 as pm
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve

# 6.1 条件付き確率
def bayes_theorem(prior, likelihood, marginal):
    return (likelihood * prior) / marginal

# 事前確率、尤度、周辺確率の設定
prior = 0.5
likelihood = 0.8
marginal = 0.7
posterior = bayes_theorem(prior, likelihood, marginal)
print(f"Posterior Probability: {posterior:.2f}")

# 6.2 ベイズ推論
# 事前確率、尤度
prior_dist = np.random.beta(2, 2, 1000)
likelihood_dist = np.random.normal(0.7, 0.1, 1000)
posterior_dist = (likelihood_dist * prior_dist) / (likelihood_dist * prior_dist).sum()

# ベイズ推論のプロット
plt.figure(figsize=(12, 8))
plt.subplot(2, 3, 1)
sns.histplot(prior_dist, color='blue', label='Prior Distribution', kde=True)
sns.histplot(likelihood_dist, color='green', label='Likelihood', kde=True)
sns.histplot(posterior_dist, color='red', label='Posterior Distribution', kde=True)
plt.title('Bayesian Inference Distributions')
plt.legend()

# 6.3 モンテカルロ法
# MCMCサンプリング
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sigma=1)
    sigma = pm.HalfNormal('sigma', sigma=1)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=np.random.normal(0, 1, 100))
    trace = pm.sample(1000, return_inferencedata=False)

# MCMCサンプリング結果のプロット
# Draw the sampled trace of mu directly into the subplot
plt.subplot(2, 3, 2)
plt.plot(trace['mu'])
plt.title('MCMC Trace of mu')
plt.xlabel('Sample')
plt.ylabel('mu')

# 7.1 主成分分析 (PCA)
data = load_iris()
X = StandardScaler().fit_transform(data.data)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# PCA結果のプロット
plt.subplot(2, 3, 3)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data.target, cmap='viridis')
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Species')

# 7.2 因子分析
fa = FactorAnalysis(n_components=2)
X_fa = fa.fit_transform(X)

# 因子分析結果のプロット
plt.subplot(2, 3, 4)
plt.scatter(X_fa[:, 0], X_fa[:, 1], c=data.target, cmap='viridis')
plt.title('Factor Analysis of Iris Dataset')
plt.xlabel('Factor 1')
plt.ylabel('Factor 2')
plt.colorbar(label='Species')

# 7.3 クラスタリング

# K-平均法
kmeans = KMeans(n_clusters=3)
y_kmeans = kmeans.fit_predict(X)

# K-平均法結果のプロット
plt.subplot(2, 3, 5)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_kmeans, cmap='viridis')
plt.title('K-Means Clustering of PCA Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster')

# 階層的クラスタリング
linked = linkage(X, 'ward')

# デンドログラムのプロット
plt.subplot(2, 3, 6)
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')

plt.tight_layout()
plt.show()

# 7.4 判別分析

# 線形判別分析 (LDA)
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, data.target)

# LDA結果のプロット
plt.figure(figsize=(8, 6))
plt.scatter(X_lda[:, 0], X_lda[:, 1], c=data.target, cmap='viridis')
plt.title('LDA of Iris Dataset')
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.colorbar(label='Species')
plt.show()

# 二次判別分析
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, data.target)
qda_pred = qda.predict(X)

# QDAの評価
print(confusion_matrix(data.target, qda_pred))
print(classification_report(data.target, qda_pred))
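
The confusion matrix above is computed on the training data itself; cross-validation gives a fairer comparison of the two discriminant analyses. A short sketch on the same standardized iris features (5-fold CV is an arbitrary choice):

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

data = load_iris()
X = StandardScaler().fit_transform(data.data)
for name, clf in [('LDA', LinearDiscriminantAnalysis()), ('QDA', QuadraticDiscriminantAnalysis())]:
    scores = cross_val_score(clf, X, data.target, cv=5)
    print(f"{name} 5-fold CV accuracy: {scores.mean():.3f}")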


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mutual_info_score
from scipy.special import xlogy
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.utils import resample
from sklearn.base import clone
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# 9.1 マルコフ連鎖
def markov_chain(transition_matrix, initial_state, steps):
    state = initial_state
    states = [state]
    for _ in range(steps):
        state = np.random.choice(len(transition_matrix), p=transition_matrix[state])
        states.append(state)
    return states

# 遷移行列と初期状態
transition_matrix = np.array([[0.9, 0.1], [0.5, 0.5]])
initial_state = 0
steps = 100
states = markov_chain(transition_matrix, initial_state, steps)

# マルコフ連鎖のプロット
plt.figure(figsize=(12, 6))
plt.subplot(2, 3, 1)
plt.plot(states, marker='o')
plt.title('Markov Chain')
plt.xlabel('Step')
plt.ylabel('State')

# 9.2 ベイジアンネットワーク(簡単な例)
# ベイジアンネットワークの例としてガウス混合モデルを使用
gmm = GaussianMixture(n_components=2)
X = np.vstack([np.random.normal(0, 1, (100, 2)), np.random.normal(3, 1, (100, 2))])
gmm.fit(X)
labels = gmm.predict(X)

# ベイジアンネットワークのプロット
plt.subplot(2, 3, 2)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.title('Bayesian Network (GMM)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# 9.3 隠れマルコフモデル (HMM)
from hmmlearn.hmm import GaussianHMM

# 隠れマルコフモデルの学習
hmm = GaussianHMM(n_components=2)
hmm.fit(X)
hidden_states = hmm.predict(X)

# 隠れマルコフモデルのプロット
plt.subplot(2, 3, 3)
plt.scatter(X[:, 0], X[:, 1], c=hidden_states, cmap='viridis')
plt.title('Hidden Markov Model (HMM)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# 10.1 情報エントロピー
def entropy(prob_dist):
    return -np.sum(prob_dist * np.log2(prob_dist + 1e-10))

# 確率分布とエントロピー計算
prob_dist = np.array([0.25, 0.25, 0.25, 0.25])
entropy_value = entropy(prob_dist)
print(f"Entropy: {entropy_value:.2f}")

# 10.2 相互情報量
def mutual_information(x, y):
    return mutual_info_score(x, y)

# 相互情報量の計算
mi_value = mutual_information(X[:, 0] > 1, X[:, 1] > 1)
print(f"Mutual Information: {mi_value:.2f}")

# 10.3 クラメロラオ限界(簡単な計算例)
def cramer_rao_bound(sigma, n):
    return sigma**2 / n

# クラメロラオ限界の計算
sigma = 1
n = 100
cr_bound = cramer_rao_bound(sigma, n)
print(f"Cramer-Rao Bound: {cr_bound:.2f}")

# 11.1 Vapnik-Chervonenkis (VC) 次元(簡単な計算例)
def vc_dimension(p, n):
    return min(n, p)

# VC次元の計算
p = 2
vc_dim = vc_dimension(p, n)
print(f"VC Dimension: {vc_dim}")

# 11.2 パックバウンディング(簡単な計算例)
def pac_bounding(n, epsilon):
    return np.sqrt((2 * np.log(2 / epsilon)) / n)

# PACバウンディングの計算
epsilon = 0.1
pac_bound = pac_bounding(n, epsilon)
print(f"PAC Bounding: {pac_bound:.2f}")

# 11.3 一般化誤差境界(簡単な計算例)
def generalization_error_bound(training_error, complexity, n):
    return training_error + complexity / np.sqrt(n)

# 一般化誤差境界の計算
training_error = 0.1
complexity = 1
generalization_bound = generalization_error_bound(training_error, complexity, n)
print(f"Generalization Error Bound: {generalization_bound:.2f}")

# 12.1 ブートストラップ法
def bootstrap(data, n_iterations):
    stats = []
    for _ in range(n_iterations):
        sample = resample(data)
        stat = np.mean(sample)
        stats.append(stat)
    return np.array(stats)

# ブートストラップ法の実行
data = np.random.normal(0, 1, 100)
bootstrap_stats = bootstrap(data, 1000)

# ブートストラップ法のプロット
plt.figure(figsize=(12, 6))
plt.subplot(2, 3, 4)
sns.histplot(bootstrap_stats, kde=True)
plt.title('Bootstrap Distribution')
plt.xlabel('Bootstrap Sample Mean')

# 12.2 ジャックナイフ法
def jackknife(data):
    n = len(data)
    jackknife_samples = np.array([np.mean(np.delete(data, i)) for i in range(n)])
    return jackknife_samples

# ジャックナイフ法の実行
jackknife_stats = jackknife(data)

# ジャックナイフ法のプロット
plt.subplot(2, 3, 5)
sns.histplot(jackknife_stats, kde=True)
plt.title('Jackknife Distribution')
plt.xlabel('Jackknife Sample Mean')

# 12.3 エンピリカルベイズ法
from sklearn.linear_model import BayesianRidge

# エンピリカルベイズ法の学習とプロット (using the GMM data X and its cluster labels as the target)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)
model = BayesianRidge()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, np.clip(np.round(y_pred), 0, 1).astype(int))

print(f"Empirical Bayes Model Accuracy: {accuracy:.2f}")

plt.subplot(2, 3, 6)
sns.histplot(y_pred, kde=True)
plt.title('Empirical Bayes Predictions')
plt.xlabel('Predicted Values')

plt.tight_layout()
plt.show()



import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sympy as sp

# Chapter 1: 確率空間と確率変数

# 1.1 確率モデルの考え方
sample_space = ['H', 'T']  # H: Head, T: Tail
probability_model = {'H': 0.5, 'T': 0.5}
print(f'Sample space: {sample_space}')
print(f'Probability model: {probability_model}')

# 1.2 根元事象と確率の割り当て
events = ['H', 'T']
probabilities = [0.5, 0.5]
for event, prob in zip(events, probabilities):
    print(f'Event: {event}, Probability: {prob}')

# 1.3 条件付き確率と独立事象
def conditional_probability(A, B, P):
    # P(A|B) = P(A ∩ B) / P(B), where A and B are sets of elementary outcomes
    P_AB = sum(P[outcome] for outcome in A & B)
    P_B = sum(P[outcome] for outcome in B)
    return P_AB / P_B

P = {('H', 'H'): 0.25, ('H', 'T'): 0.25, ('T', 'H'): 0.25, ('T', 'T'): 0.25}
A = {('H', 'H'), ('H', 'T')}
B = {('H', 'H'), ('T', 'H')}

P_A_given_B = conditional_probability(A, B, P)
print(f'P(A|B): {P_A_given_B}')

# 独立事象の確認: independence means P(A|B) == P(A)
P_A = sum(P[outcome] for outcome in A)
independent = np.isclose(P_A_given_B, P_A)
print(f'Are A and B independent? {independent}')

# 1.4 確率変数と確率分布
np.random.seed(42)
n = 1000
data = np.random.binomial(n=1, p=0.5, size=n)

plt.hist(data, bins=2, edgecolor='k', alpha=0.7)
plt.xlabel('Outcome')
plt.ylabel('Frequency')
plt.title('Probability Distribution of a Bernoulli Random Variable')
plt.xticks([0, 1], ['T', 'H'])
plt.show()

# 1.5 主要な定理のまとめ
def bayes_theorem(P_A, P_B_given_A, P_B):
    return (P_B_given_A * P_A) / P_B

P_A = 0.01
P_B_given_A = 0.9
P_B = 0.2

P_A_given_B = bayes_theorem(P_A, P_B_given_A, P_B)
print(f'P(A|B): {P_A_given_B}')
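
# Monte Carlo sanity check of the Bayes' theorem example above (illustrative sketch).
# P(B | not A) is derived from the law of total probability so that the simulated
# P(B) matches the assumed value P_B.
N_sim = 1_000_000
P_B_given_not_A = (P_B - P_B_given_A * P_A) / (1 - P_A)
A_occurs = np.random.rand(N_sim) < P_A
B_occurs = np.where(A_occurs,
                    np.random.rand(N_sim) < P_B_given_A,
                    np.random.rand(N_sim) < P_B_given_not_A)
print(f'Simulated P(A|B): {A_occurs[B_occurs].mean():.4f} (analytic: {P_A_given_B:.4f})')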

# Chapter 6: 多変数関数

# 6.1 多変数関数の微分
x, y = sp.symbols('x y')
f = x**2 + y**2

# 偏微分
f_x = sp.diff(f, x)
f_y = sp.diff(f, y)
print(f'Partial derivative with respect to x: {f_x}')
print(f'Partial derivative with respect to y: {f_y}')

# 全微分
df = f_x * sp.Symbol('dx') + f_y * sp.Symbol('dy')
print(f'Total differential: {df}')

# 高階偏導関数
f_xx = sp.diff(f_x, x)
f_yy = sp.diff(f_y, y)
f_xy = sp.diff(f_x, y)
print(f'Second order partial derivative with respect to x: {f_xx}')
print(f'Second order partial derivative with respect to y: {f_yy}')
print(f'Mixed partial derivative: {f_xy}')

# 多変数関数のテイラーの公式
def taylor_series_multivariable(func, vars, point, order):
    # Simple expansion: value at the point plus pure partial-derivative terms
    # (mixed partial terms are omitted for brevity)
    subs_point = [(v, p) for v, p in zip(vars, point)]
    series = func.subs(subs_point)
    for i in range(1, order + 1):
        for var in vars:
            deriv_at_point = sp.diff(func, var, i).subs(subs_point)
            series += deriv_at_point / sp.factorial(i) * (var - point[vars.index(var)])**i
    return series

taylor_f = taylor_series_multivariable(f, [x, y], [0, 0], 2)
print(f'Taylor series expansion: {taylor_f}')

# 6.2 写像の微分
u, v = sp.symbols('u v')
g = sp.Matrix([x + y, x - y])
jacobian_g = g.jacobian([x, y])
print(f'Jacobian matrix of the mapping: {jacobian_g}')

# アフィン変換による写像の近似
affine_approx = jacobian_g.subs([(x, 1), (y, 1)]) * sp.Matrix([u, v]) + sp.Matrix([1, 1])
print(f'Affine approximation at (1, 1): {affine_approx}')

# 6.3 極値問題
# 6.3.1 1変数関数の極値問題
f = x**3 - 3*x**2 + 2
f_prime = sp.diff(f, x)
critical_points = sp.solve(f_prime, x)
second_derivative = sp.diff(f_prime, x)

extrema = []
for point in critical_points:
    if second_derivative.subs(x, point) > 0:
        extrema.append((point, "min"))
    elif second_derivative.subs(x, point) < 0:
        extrema.append((point, "max"))

print(f'Extrema of the function: {extrema}')

# 6.3.2 2変数関数の極値問題
f = x**2 + y**2 - 4*x - 6*y
f_x = sp.diff(f, x)
f_y = sp.diff(f, y)

critical_points = sp.solve([f_x, f_y], (x, y))
second_derivative_xx = sp.diff(f_x, x)
second_derivative_yy = sp.diff(f_y, y)
second_derivative_xy = sp.diff(f_x, y)
hessian = sp.Matrix([[second_derivative_xx, second_derivative_xy], [second_derivative_xy, second_derivative_yy]])

extrema = []
for point in critical_points:
    hessian_at_point = hessian.subs([(x, point[0]), (y, point[1])])
    hessian_det = hessian_at_point.det()
    if hessian_det > 0:
        if second_derivative_xx.subs([(x, point[0]), (y, point[1])]) > 0:
            extrema.append((point, "min"))
        else:
            extrema.append((point, "max"))
    elif hessian_det < 0:
        extrema.append((point, "saddle"))

print(f'Extrema of the function: {extrema}')



import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Chapter 2: 離散型の確率分布

# 2.1 確率変数の期待値と分散
def expected_value(values, probabilities):
    return np.sum(np.array(values) * np.array(probabilities))

def variance(values, probabilities, expected_value):
    return np.sum((np.array(values) - expected_value)**2 * np.array(probabilities))

# 例: サイコロの期待値と分散
values = [1, 2, 3, 4, 5, 6]
probabilities = [1/6] * 6
mean = expected_value(values, probabilities)
var = variance(values, probabilities, mean)

print(f'Expected value (mean): {mean}')
print(f'Variance: {var}')

# 2.2 共分散と相関係数
def covariance(X, Y, P):
    return np.sum((np.array(X) - np.mean(X)) * (np.array(Y) - np.mean(Y)) * np.array(P))

def correlation(X, Y, P):
    cov = covariance(X, Y, P)
    std_X = np.sqrt(variance(X, P, np.mean(X)))
    std_Y = np.sqrt(variance(Y, P, np.mean(Y)))
    return cov / (std_X * std_Y)

# 例: XとYの共分散と相関係数
X = [1, 2, 3, 4, 5]
Y = [2, 4, 6, 8, 10]
P = [0.2] * 5
cov_XY = covariance(X, Y, P)
corr_XY = correlation(X, Y, P)

print(f'Covariance: {cov_XY}')
print(f'Correlation coefficient: {corr_XY}')

# 2.3 主要な離散型確率分布
# 2.3.1 離散一様分布
def plot_uniform_distribution(n):
    x = np.arange(n)
    probabilities = [1/n] * n
    plt.bar(x, probabilities)
    plt.title('Discrete Uniform Distribution')
    plt.xlabel('X')
    plt.ylabel('Probability')
    plt.show()

plot_uniform_distribution(6)

# 2.3.2 ベルヌーイ分布
def plot_bernoulli_distribution(p):
    x = [0, 1]
    probabilities = [1-p, p]
    plt.bar(x, probabilities)
    plt.title(f'Bernoulli Distribution (p={p})')
    plt.xlabel('X')
    plt.ylabel('Probability')
    plt.show()

plot_bernoulli_distribution(0.3)

# 2.3.3 二項分布
def plot_binomial_distribution(n, p):
    x = np.arange(n+1)
    probabilities = stats.binom.pmf(x, n, p)
    plt.bar(x, probabilities)
    plt.title(f'Binomial Distribution (n={n}, p={p})')
    plt.xlabel('X')
    plt.ylabel('Probability')
    plt.show()

plot_binomial_distribution(10, 0.5)

# 2.3.4 ポアソン分布
def plot_poisson_distribution(lam):
    x = np.arange(0, 20)
    probabilities = stats.poisson.pmf(x, lam)
    plt.bar(x, probabilities)
    plt.title(f'Poisson Distribution (lambda={lam})')
    plt.xlabel('X')
    plt.ylabel('Probability')
    plt.show()

plot_poisson_distribution(5)

# 2.4 大数の法則
n = 10000
p = 0.5
samples = np.random.binomial(1, p, size=n)
empirical_mean = np.mean(samples)
theoretical_mean = p
print(f'Empirical mean: {empirical_mean}')
print(f'Theoretical mean: {theoretical_mean}')
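
# Running mean of the Bernoulli samples: the law of large numbers in action
# (illustrative sketch reusing the samples generated above).
running_mean = np.cumsum(samples) / np.arange(1, n + 1)
plt.plot(running_mean, label='Running mean')
plt.axhline(p, color='red', linestyle='--', label='True mean p')
plt.title('Law of Large Numbers: Running Mean of Bernoulli Samples')
plt.xlabel('Number of trials')
plt.ylabel('Mean')
plt.legend()
plt.grid(True)
plt.show()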

# 2.5 主要な定理のまとめ
# コード実装は不要なため省略

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Chapter 3: 連続型の確率分布

# 3.1 連続的確率空間
# 連続型確率分布のプロット
def plot_continuous_distribution(dist, params, x_range):
    x = np.linspace(x_range[0], x_range[1], 1000)
    y = dist.pdf(x, *params)
    plt.plot(x, y, label=f'{type(dist).__name__} Distribution')
    plt.title(f'{type(dist).__name__} Distribution')
    plt.xlabel('x')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.show()

# 3.2 連続型の確率変数の性質
# 例: 正規分布の性質
mu, sigma = 0, 1
dist = stats.norm(mu, sigma)
x_range = [-5, 5]

plot_continuous_distribution(stats.norm, [mu, sigma], x_range)

# 3.3 正規分布の性質
def normal_distribution_statistics(mu, sigma, x):
    return stats.norm.pdf(x, mu, sigma), stats.norm.cdf(x, mu, sigma)

x_values = np.linspace(-5, 5, 1000)
pdf_values, cdf_values = normal_distribution_statistics(mu, sigma, x_values)

plt.plot(x_values, pdf_values, label='PDF')
plt.plot(x_values, cdf_values, label='CDF')
plt.title('Normal Distribution: PDF and CDF')
plt.xlabel('x')
plt.ylabel('Probability')
plt.legend()
plt.show()

# Chapter 4: パラメトリック推定と仮説検定

# 4.1 最尤推定法と不偏推定量
def maximum_likelihood_estimation(data):
    mu, sigma = np.mean(data), np.std(data, ddof=1)
    return mu, sigma

data = np.random.normal(mu, sigma, size=1000)
mu_hat, sigma_hat = maximum_likelihood_estimation(data)
print(f'MLE for mu: {mu_hat}')
print(f'MLE for sigma: {sigma_hat}')

# 4.2 仮説検定の考え方
def hypothesis_test(data, mu_0, alpha=0.05):
    t_stat, p_value = stats.ttest_1samp(data, mu_0)
    reject_null = p_value < alpha
    return t_stat, p_value, reject_null

mu_0 = 0
t_stat, p_value, reject_null = hypothesis_test(data, mu_0)
print(f'T-statistic: {t_stat}')
print(f'P-value: {p_value}')
print(f'Reject null hypothesis: {reject_null}')

# Appendix A: 機械学習への応用例

# A.1 最小二乗法による回帰分析
X = np.random.rand(100, 1) * 10
y = 3 * X.squeeze() + np.random.randn(100) * 2
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, y_pred, color='red', linewidth=2, label='Fitted line')
plt.title('Least Squares Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# A.2 ロジスティック回帰による分類アルゴリズム
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=42)
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', label='Data')
plt.title('Logistic Regression Classification')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# A.3 k平均法によるクラスタリング
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
kmeans = KMeans(n_clusters=4)
y_kmeans = kmeans.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75, marker='X')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

