import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Set random seed for reproducibility
np.random.seed(0)
# 3. Descriptive Statistics
data = np.random.normal(0, 1, 1000)
mean = np.mean(data)
median = np.median(data)
std_dev = np.std(data)
variance = np.var(data)
skewness = pd.Series(data).skew()
kurtosis = pd.Series(data).kurtosis()
# 4. Inferential Statistics
t_stat, p_value_ttest = stats.ttest_1samp(data, 0)
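# Quick manual check (added sketch): the same t-statistic computed by hand,
# t = (x_bar - mu0) / (s / sqrt(n)), using the sample standard deviation
t_manual = (np.mean(data) - 0) / (np.std(data, ddof=1) / np.sqrt(len(data)))
print(f"t (manual): {t_manual:.4f}, t (scipy): {t_stat:.4f}")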
# 5. Comparing Means
group1 = np.random.normal(0, 1, 100)
group2 = np.random.normal(0.5, 1, 100)
t_stat_ind, p_value_ttest_ind = stats.ttest_ind(group1, group2)
# 6. Analysis of Variance (ANOVA)
group3 = np.random.normal(1, 1, 100)
f_stat, p_value_anova = stats.f_oneway(group1, group2, group3)
# 7. Correlation and Regression
x = np.random.normal(0, 1, 100)
y = 2 * x + np.random.normal(0, 1, 100)
correlation = np.corrcoef(x, y)[0, 1]
slope, intercept, r_value, p_value_regression, std_err = stats.linregress(x, y)
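# Consistency check (added sketch): for simple linear regression, the squared
# Pearson correlation equals the regression R^2
assert np.isclose(correlation**2, r_value**2)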
# 8. Analysis of Qualitative Data
categories = np.random.choice(['Category A', 'Category B', 'Category C'], 100)
freq_table = pd.Series(categories).value_counts()
# Plotting
fig, axs = plt.subplots(3, 2, figsize=(15, 18))
# 3. Descriptive Statistics Plot
axs[0, 0].hist(data, bins=30, edgecolor='k', alpha=0.7)
axs[0, 0].set_title('Histogram of Data')
axs[0, 0].set_xlabel('Value')
axs[0, 0].set_ylabel('Frequency')
# 4. Inferential Statistics Plot
axs[0, 1].text(0.5, 0.5, f'T-statistic: {t_stat:.2f}\nP-value: {p_value_ttest:.2f}',
fontsize=12, ha='center', va='center')
axs[0, 1].set_title('T-test Results')
axs[0, 1].axis('off')
# 5. Comparing Means Plot
axs[1, 0].bar(['Group 1', 'Group 2'], [np.mean(group1), np.mean(group2)], color=['blue', 'orange'])
axs[1, 0].set_title('Comparison of Means')
axs[1, 0].set_ylabel('Mean')
# 6. Analysis of Variance (ANOVA) Plot
axs[1, 1].boxplot([group1, group2, group3], labels=['Group 1', 'Group 2', 'Group 3'])
axs[1, 1].set_title('ANOVA - Boxplot')
axs[1, 1].set_ylabel('Values')
# 7. Correlation and Regression Plot
sns.regplot(x=x, y=y, ax=axs[2, 0])
axs[2, 0].set_title(f'Correlation: {correlation:.2f}\nR-squared: {r_value**2:.2f}')
axs[2, 0].set_xlabel('X')
axs[2, 0].set_ylabel('Y')
# 8. Analysis of Qualitative Data Plot
axs[2, 1].bar(freq_table.index, freq_table.values, color=['red', 'green', 'blue'])
axs[2, 1].set_title('Frequency of Categories')
axs[2, 1].set_xlabel('Category')
axs[2, 1].set_ylabel('Frequency')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
import seaborn as sns
# Load dataset
data = load_iris()
X = data.data
y = data.target
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Initialize models
models = {
'Logistic Regression': LogisticRegression(max_iter=200),
'Support Vector Machine': SVC(),
'Linear Regression': LinearRegression()
}
# Train and evaluate models
results = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
if name == 'Linear Regression':
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
results[name] = {'MSE': mse, 'R^2': r2}
else:
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred, output_dict=True)
results[name] = {'Accuracy': accuracy, 'Confusion Matrix': cm, 'Classification Report': cr}
# Plotting
fig, axs = plt.subplots(2, 2, figsize=(15, 12))
# Confusion Matrix Plot for Logistic Regression and SVM
for idx, (name, result) in enumerate(results.items()):
if name != 'Linear Regression':
sns.heatmap(result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues', ax=axs[idx//2, idx%2])
axs[idx//2, idx%2].set_title(f'{name} - Confusion Matrix')
axs[idx//2, idx%2].set_xlabel('Predicted Label')
axs[idx//2, idx%2].set_ylabel('True Label')
# The bottom-left panel is unused by the loop above; hide its axes
axs[1, 0].axis('off')
# Display MSE and R^2 for Linear Regression
axs[1, 1].text(0.5, 0.5, f'Linear Regression:\nMSE: {results["Linear Regression"]["MSE"]:.2f}\nR^2: {results["Linear Regression"]["R^2"]:.2f}',
fontsize=12, ha='center', va='center')
axs[1, 1].set_title('Linear Regression Metrics')
axs[1, 1].axis('off')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Generate sample data
np.random.seed(0)
data = pd.DataFrame({
'A': np.random.normal(0, 1, 100),
'B': np.random.normal(1, 2, 100),
'Category': np.random.choice(['Group1', 'Group2'], 100)
})
# Scatter Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='A', y='B', hue='Category', data=data, palette='viridis')
plt.title('Scatter Plot of A vs B')
plt.xlabel('A')
plt.ylabel('B')
plt.legend(title='Category')
plt.show()
# Histogram
plt.figure(figsize=(10, 6))
sns.histplot(data['A'], kde=True, color='skyblue')
plt.title('Histogram of A with KDE')
plt.xlabel('A')
plt.ylabel('Frequency')
plt.show()
# Box Plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Category', y='B', data=data, palette='Set2')
plt.title('Box Plot of B by Category')
plt.xlabel('Category')
plt.ylabel('B')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Generate sample economic data
np.random.seed(0)
data = pd.DataFrame({
'GDP': np.random.normal(50000, 10000, 100),
'Unemployment': np.random.normal(5, 1.5, 100),
'Inflation': np.random.normal(2, 0.5, 100)
})
# Summary statistics
print(data.describe())
# GDP vs Unemployment Scatter Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='GDP', y='Unemployment', data=data, color='blue')
plt.title('GDP vs Unemployment')
plt.xlabel('GDP')
plt.ylabel('Unemployment')
plt.show()
# Histogram of Inflation
plt.figure(figsize=(10, 6))
sns.histplot(data['Inflation'], kde=True, color='green')
plt.title('Histogram of Inflation')
plt.xlabel('Inflation Rate')
plt.ylabel('Frequency')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# Generate sample economic data with classification target
np.random.seed(0)
data = pd.DataFrame({
'GDP': np.random.normal(50000, 10000, 200),
'Unemployment': np.random.normal(5, 1.5, 200),
'Inflation': np.random.normal(2, 0.5, 200),
'Class': np.random.choice([0, 1], 200)
})
# Features and target
X = data[['GDP', 'Unemployment', 'Inflation']]
y = data['Class']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train SVM classifier
model = SVC(kernel='linear', random_state=0)
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Generate sample data
np.random.seed(0)
data1 = np.random.normal(100, 15, 100)
data2 = np.random.normal(110, 20, 100)
# Descriptive Statistics
mean1, std1, var1 = np.mean(data1), np.std(data1), np.var(data1)
mean2, std2, var2 = np.mean(data2), np.std(data2), np.var(data2)
print(f"Data1 - Mean: {mean1:.2f}, Std Dev: {std1:.2f}, Variance: {var1:.2f}")
print(f"Data2 - Mean: {mean2:.2f}, Std Dev: {std2:.2f}, Variance: {var2:.2f}")
# T-Test for the means of two independent samples
t_stat, p_value = stats.ttest_ind(data1, data2)
print(f"T-Statistic: {t_stat:.2f}, P-Value: {p_value:.2f}")
# Interpret the p-value
if p_value < 0.05:
print("The difference between the means is statistically significant.")
else:
print("The difference between the means is not statistically significant.")
# Plotting
plt.figure(figsize=(12, 6))
# Histogram
plt.subplot(1, 2, 1)
sns.histplot(data1, kde=True, color='blue', label='Data1', alpha=0.6)
sns.histplot(data2, kde=True, color='red', label='Data2', alpha=0.6)
plt.title('Histogram of Data1 and Data2')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
# Box Plot
plt.subplot(1, 2, 2)
sns.boxplot(data=[data1, data2], palette=['blue', 'red'])
plt.title('Box Plot of Data1 and Data2')
plt.xlabel('Group')
plt.ylabel('Value')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Generate sample data
np.random.seed(0)
data = np.random.normal(100, 15, 1000)
# Create a DataFrame
df = pd.DataFrame(data, columns=['Value'])
# Calculate descriptive statistics
mean = df['Value'].mean()
median = df['Value'].median()
std_dev = df['Value'].std()
variance = df['Value'].var()
min_val = df['Value'].min()
max_val = df['Value'].max()
print(f"Mean: {mean:.2f}")
print(f"Median: {median:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Min: {min_val:.2f}")
print(f"Max: {max_val:.2f}")
# Frequency distribution table
bins = np.arange(min_val, max_val + 15, 15) # Define bins with a width of 15
df['Bin'] = pd.cut(df['Value'], bins=bins, include_lowest=True)  # include the minimum value in the first bin
frequency_table = df['Bin'].value_counts().sort_index()
print("\nFrequency Distribution Table:")
print(frequency_table)
# Histogram and Frequency Distribution Plot
plt.figure(figsize=(12, 6))
# Histogram
plt.subplot(1, 2, 1)
sns.histplot(df['Value'], bins=bins, kde=False, color='blue', edgecolor='black')
plt.title('Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
# Frequency Distribution
plt.subplot(1, 2, 2)
frequency_table.plot(kind='bar', color='orange', edgecolor='black')
plt.title('Frequency Distribution')
plt.xlabel('Bins')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # needed for the box plot below
from scipy import stats
# Generate sample data
np.random.seed(0)
data = np.random.normal(100, 15, 1000)
# Create a DataFrame
df = pd.DataFrame(data, columns=['Value'])
# Calculate descriptive statistics
mean = df['Value'].mean()
median = df['Value'].median()
mode = df['Value'].round(0).mode()[0]  # round first: continuous values are almost surely unique, so a raw mode is uninformative
print(f"Mean: {mean:.2f}")
print(f"Median: {median:.2f}")
print(f"Mode: {mode:.2f}")
# Define Sigma (Σ) notation functions
def sigma_sum(data):
return np.sum(data)
# Calculate mean using Sigma notation
N = len(df)
mean_sigma = sigma_sum(df['Value']) / N
print(f"Mean (Sigma notation): {mean_sigma:.2f}")
# Plotting
plt.figure(figsize=(12, 6))
# Histogram with mean, median, and mode
plt.subplot(1, 2, 1)
plt.hist(df['Value'], bins=30, edgecolor='black', alpha=0.7)
plt.axvline(mean, color='r', linestyle='dashed', linewidth=1, label=f'Mean ({mean:.2f})')
plt.axvline(median, color='g', linestyle='dashed', linewidth=1, label=f'Median ({median:.2f})')
plt.axvline(mode, color='b', linestyle='dashed', linewidth=1, label=f'Mode ({mode:.2f})')
plt.title('Histogram with Mean, Median, and Mode')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
# Box plot
plt.subplot(1, 2, 2)
sns.boxplot(x=df['Value'], color='orange')
plt.title('Box Plot')
plt.xlabel('Value')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # needed for the box plot below
from scipy import stats
# Generate sample data
np.random.seed(0)
data = np.random.normal(100, 15, 1000)
# Define weights (for demonstration purposes, we'll use a linear weight)
weights = np.linspace(1, 2, len(data))
# Calculate weighted mean
def weighted_mean(data, weights):
return np.sum(data * weights) / np.sum(weights)
# Calculate geometric mean
def geometric_mean(data):
return stats.gmean(data)
# Calculate statistics
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(np.round(data), keepdims=False).mode  # round first (continuous values are almost surely unique); keepdims=False returns a scalar
weighted_avg = weighted_mean(data, weights)
geo_mean = geometric_mean(data)
print(f"Mean: {mean:.2f}")
print(f"Median: {median:.2f}")
print(f"Mode: {mode:.2f}")
print(f"Weighted Mean: {weighted_avg:.2f}")
print(f"Geometric Mean: {geo_mean:.2f}")
# Plotting
plt.figure(figsize=(12, 6))
# Histogram with mean, median, mode, weighted mean, and geometric mean
plt.subplot(1, 2, 1)
plt.hist(data, bins=30, edgecolor='black', alpha=0.7)
plt.axvline(mean, color='r', linestyle='dashed', linewidth=1, label=f'Mean ({mean:.2f})')
plt.axvline(median, color='g', linestyle='dashed', linewidth=1, label=f'Median ({median:.2f})')
plt.axvline(mode, color='b', linestyle='dashed', linewidth=1, label=f'Mode ({mode:.2f})')
plt.axvline(weighted_avg, color='orange', linestyle='dashed', linewidth=1, label=f'Weighted Mean ({weighted_avg:.2f})')
plt.axvline(geo_mean, color='purple', linestyle='dashed', linewidth=1, label=f'Geometric Mean ({geo_mean:.2f})')
plt.title('Histogram with Various Means')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
# Box plot
plt.subplot(1, 2, 2)
sns.boxplot(x=data, color='lightblue')
plt.title('Box Plot')
plt.xlabel('Value')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Generate sample data
np.random.seed(0)
data = np.random.normal(100, 15, 1000)
# Calculate deviation
deviation = data - np.mean(data)
# Calculate mean absolute deviation (MAD)
mean_absolute_deviation = np.mean(np.abs(deviation))
# Calculate variance
variance = np.var(data, ddof=0)
# Calculate standard deviation
standard_deviation = np.sqrt(variance)
# Calculate coefficient of variation
coefficient_of_variation = (standard_deviation / np.mean(data)) * 100
# Display results
print(f"Mean: {np.mean(data):.2f}")
print(f"Mean Absolute Deviation (MAD): {mean_absolute_deviation:.2f}")
print(f"Variance: {variance:.2f}")
print(f"Standard Deviation: {standard_deviation:.2f}")
print(f"Coefficient of Variation: {coefficient_of_variation:.2f}%")
# Plotting
plt.figure(figsize=(12, 6))
# Histogram with Mean, Mean Absolute Deviation, Standard Deviation
plt.subplot(1, 2, 1)
plt.hist(data, bins=30, edgecolor='black', alpha=0.7)
plt.axvline(np.mean(data), color='r', linestyle='dashed', linewidth=1, label=f'Mean ({np.mean(data):.2f})')
plt.axvline(np.mean(data) - standard_deviation, color='g', linestyle='dashed', linewidth=1, label=f'St. Dev. ({standard_deviation:.2f})')
plt.axvline(np.mean(data) + standard_deviation, color='g', linestyle='dashed', linewidth=1)
plt.axvline(np.mean(data) - mean_absolute_deviation, color='orange', linestyle='dashed', linewidth=1, label=f'MAD ({mean_absolute_deviation:.2f})')
plt.axvline(np.mean(data) + mean_absolute_deviation, color='orange', linestyle='dashed', linewidth=1)
plt.title('Histogram with Measures of Dispersion')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
# Box plot
plt.subplot(1, 2, 2)
plt.boxplot(data, vert=False, patch_artist=True)
plt.title('Box Plot')
plt.xlabel('Value')
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Generate sample data
np.random.seed(0)
data1 = np.random.normal(50, 10, 1000)
data2 = 0.5 * data1 + np.random.normal(0, 5, 1000)
# 1. Lorenz Curve and Gini Coefficient
def lorenz_curve(data):
sorted_data = np.sort(data)
cum_data = np.cumsum(sorted_data)
total_sum = np.sum(sorted_data)
return np.append([0], cum_data / total_sum)
def gini_coefficient(data):
    lorenz = lorenz_curve(data)  # length n + 1, starting at 0
    n = len(data)
    x = np.linspace(0, 1, n + 1)  # must match the Lorenz curve's length
    A = np.trapz(lorenz, x)  # area under the Lorenz curve
    B = 0.5 - A  # area between the equality line and the Lorenz curve
    return B / 0.5
# Calculate Lorenz Curve and Gini Coefficient
lorenz = lorenz_curve(data1)
gini_index = gini_coefficient(data1)
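# Cross-check (added sketch): the Gini coefficient also equals the mean
# absolute difference between all pairs of values, divided by twice the mean
gini_pairwise = np.abs(data1[:, None] - data1[None, :]).mean() / (2 * np.mean(data1))
print(f"Gini (pairwise formula): {gini_pairwise:.2f}")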
# Plot Lorenz Curve
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1), 'k--', label='Perfect Equality')
plt.plot(np.linspace(0, 1, len(lorenz)), lorenz, label=f'Lorenz Curve (Gini={gini_index:.2f})')
plt.title('Lorenz Curve')
plt.xlabel('Cumulative Share of Population')
plt.ylabel('Cumulative Share of Income')
plt.legend()
# 2. Standardized Variables
standardized_data1 = (data1 - np.mean(data1)) / np.std(data1)
standardized_data2 = (data2 - np.mean(data2)) / np.std(data2)
# Plot standardized data
plt.subplot(1, 2, 2)
plt.scatter(standardized_data1, standardized_data2, alpha=0.5)
plt.title('Standardized Data Scatter Plot')
plt.xlabel('Standardized Data 1')
plt.ylabel('Standardized Data 2')
plt.tight_layout()
plt.show()
# 3. Covariance
covariance = np.cov(data1, data2)[0, 1]
# 4. Correlation Coefficient
correlation_coefficient = np.corrcoef(data1, data2)[0, 1]
# Display results
print(f"Gini Coefficient: {gini_index:.2f}")
print(f"Covariance: {covariance:.2f}")
print(f"Correlation Coefficient: {correlation_coefficient:.2f}")
# Scatter plot with correlation
plt.figure(figsize=(8, 6))
plt.scatter(data1, data2, alpha=0.5)
plt.title('Scatter Plot with Correlation')
plt.xlabel('Data 1')
plt.ylabel('Data 2')
plt.show()
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats
# 1. Probability Model
def probability_model():
np.random.seed(0)
# Generate sample data
X = np.random.rand(100, 2)
X = sm.add_constant(X) # Add constant term (intercept)
beta_true = np.array([2, -1, 3])
y = X @ beta_true + np.random.normal(0, 1, 100)
# Fit the model
model = sm.OLS(y, X).fit()
print(model.summary())
return X, y, model
# 2. Variation in Coefficients
def coefficient_variation(X, y):
    np.random.seed(1)
    n = len(y)
    coefficients = []
    for _ in range(1000):
        idx = np.random.randint(0, n, size=n)  # bootstrap resample of the rows; refitting the same data would give identical params every time
        model = sm.OLS(y[idx], X[idx]).fit()
        coefficients.append(model.params)
    coefficients = np.array(coefficients)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.boxplot(coefficients[:, 0], vert=False)
plt.title('Intercept')
plt.subplot(1, 2, 2)
plt.boxplot(coefficients[:, 1:], vert=False)
plt.title('Coefficients')
plt.suptitle('Distribution of Coefficients')
plt.show()
# 3. Distribution of Coefficients
# Note: a single fit yields only a handful of coefficients, so we inspect the
# residuals instead; their normality is what justifies the t-distribution of
# the coefficient estimates.
def coefficient_distribution(X, y):
    model = sm.OLS(y, X).fit()
    residuals = model.resid
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.hist(residuals, bins=20, edgecolor='black')
    plt.title('Histogram of Residuals')
    plt.subplot(1, 2, 2)
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.title('QQ Plot of Residuals')
    plt.suptitle('Residual Distribution')
    plt.show()
# 4. t-test for Coefficients
def t_test_coefficients(X, y):
model = sm.OLS(y, X).fit()
print("t-test results for coefficients:")
print(model.tvalues)
print("p-values for coefficients:")
print(model.pvalues)
# 5. Variable Selection using t-test
def variable_selection(X, y):
model = sm.OLS(y, X).fit()
p_values = model.pvalues[1:] # Exclude intercept
significant_vars = np.where(p_values < 0.05)[0]
print(f"Significant variables (p < 0.05): {significant_vars}")
# Run the functions
X, y, model = probability_model()
coefficient_variation(X, y)
coefficient_distribution(X, y)
t_test_coefficients(X, y)
variable_selection(X, y)
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
# 1. Multiple Regression - Least Squares Method
def multiple_regression(X, y):
X = sm.add_constant(X) # Add constant term (intercept)
model = sm.OLS(y, X).fit()
print("Regression Summary:")
print(model.summary())
return X, y, model
# 2. Goodness of Fit Measures
def goodness_of_fit(model):
print("\nGoodness of Fit Measures:")
print(f"R-squared: {model.rsquared:.4f}")
print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")
print(f"AIC: {model.aic:.4f}")
print(f"BIC: {model.bic:.4f}")
# 3. t-test for Individual Coefficients
def t_test_coefficients(model):
print("\nCoefficients t-tests:")
print("t-values:")
print(model.tvalues)
print("p-values:")
print(model.pvalues)
# 4. Multicollinearity
def multicollinearity(X):
    X = pd.DataFrame(sm.add_constant(X))  # ensure a DataFrame; a constant is added only if one is missing
    X.columns = ['const'] + [f'x{i}' for i in range(1, X.shape[1])]
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    print("\nVariance Inflation Factors (VIF):")
    print(vif)
# 5. Application of Multiple Regression Analysis
def application_example():
# Generate sample data
np.random.seed(0)
X = np.random.rand(100, 3)
beta_true = np.array([2, -1, 3, 4])
y = X @ beta_true[1:] + beta_true[0] + np.random.normal(0, 1, 100)
X, y, model = multiple_regression(X, y)
goodness_of_fit(model)
t_test_coefficients(model)
multicollinearity(X)
# Predict and plot
predictions = model.predict(X)
plt.figure(figsize=(12, 6))
plt.scatter(y, predictions, edgecolor='k', alpha=0.7)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs Predicted Values')
plt.show()
# Run the application example
application_example()
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# 1. Model Function Forms
def generate_data():
np.random.seed(0)
X = np.linspace(1, 10, 100)
noise = np.random.normal(0, 1, X.shape)
y = 2 * np.log(X) + noise
return X, y
# 2. Log-linear Model
def log_linear_model(X, y):
X_log = np.log(X)
X_log = sm.add_constant(X_log) # Add constant term (intercept)
model = sm.OLS(y, X_log).fit()
print("\nLog-linear Model Summary:")
print(model.summary())
return model
# 3. Polynomial Model
def polynomial_model(X, y, degree=3):
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X.reshape(-1, 1))
model = LinearRegression().fit(X_poly, y)
y_pred = model.predict(X_poly)
print("\nPolynomial Model Summary:")
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
return model, y_pred
# 4. Hyperbolic Model
def hyperbolic_model(X, y):
X_inv = 1 / X
X_inv = sm.add_constant(X_inv) # Add constant term (intercept)
model = sm.OLS(y, X_inv).fit()
print("\nHyperbolic Model Summary:")
print(model.summary())
return model
# Plotting function
def plot_models(X, y, models, labels):
plt.figure(figsize=(14, 8))
plt.scatter(X, y, color='black', label='Data')
X_range = np.linspace(min(X), max(X), 1000)
for model, label in zip(models, labels):
if label == 'Polynomial':
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_range.reshape(-1, 1))
y_pred = model.predict(X_poly)
elif label == 'Hyperbolic':
X_inv_range = 1 / X_range
y_pred = model.predict(sm.add_constant(X_inv_range))
elif label == 'Log-linear':
X_log_range = np.log(X_range)
y_pred = model.predict(sm.add_constant(X_log_range))
plt.plot(X_range, y_pred, label=f'{label} Model')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Curve Fitting with Different Models')
plt.legend()
plt.show()
# Run the analysis
X, y = generate_data()
log_model = log_linear_model(X, y)
poly_model, y_poly_pred = polynomial_model(X, y)
hyper_model = hyperbolic_model(X, y)
# Plot all models
plot_models(X, y, [log_model, poly_model, hyper_model], ['Log-linear', 'Polynomial', 'Hyperbolic'])
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Create sample data
np.random.seed(0)
n = 100
X = np.random.rand(n)
X_cat = np.random.choice(['A', 'B', 'C'], size=n)  # categorical variable
y = 5 + 2 * X + (X_cat == 'A') * 3 + (X_cat == 'B') * 2 + np.random.randn(n)
# Build the DataFrame
data = pd.DataFrame({'X': X, 'X_cat': X_cat, 'y': y})
# 1. Temporary dummy variables
# Convert the categorical variable into dummy variables
data_dummies = pd.get_dummies(data['X_cat'], drop_first=True).astype(float)  # 'A' is the reference category; cast to float for OLS
data_with_dummies = pd.concat([data, data_dummies], axis=1)
X_dummies = sm.add_constant(data_with_dummies[['X', 'B', 'C']])
model_dummies = sm.OLS(data_with_dummies['y'], X_dummies).fit()
print("\n1. 一時的ダミー変数による回帰結果:")
print(model_dummies.summary())
# 2. Intercept dummy variables
# Add intercept dummy variables (the dummies shift the constant term by category)
data_with_constant_dummies = pd.concat([data, data_dummies], axis=1)
X_constant_dummies = sm.add_constant(data_with_constant_dummies[['X', 'B', 'C']])
model_constant_dummies = sm.OLS(data_with_constant_dummies['y'], X_constant_dummies).fit()
print("\n2. 定数項ダミー変数による回帰結果:")
print(model_constant_dummies.summary())
# 3. Coefficient (slope) dummy variables
# Slope dummies: interact X with the category dummies so each category gets its
# own slope; including all raw dummies plus a constant would be perfectly collinear
X_cat_dummies = pd.get_dummies(data['X_cat'], drop_first=True).astype(float)
slope_dummies = X_cat_dummies.mul(data['X'], axis=0).add_prefix('X_')
data_with_coefficients = pd.concat([data, slope_dummies], axis=1)
X_coefficients = sm.add_constant(data_with_coefficients[['X', 'X_B', 'X_C']])
model_coefficients = sm.OLS(data_with_coefficients['y'], X_coefficients).fit()
print("\n3. 係数ダミー変数による回帰結果:")
print(model_coefficients.summary())
# Visualize the results
plt.figure(figsize=(12, 8))
# Temporary dummy variable results
plt.subplot(3, 1, 1)
plt.scatter(data['X'], data['y'], label='Data', color='blue')
plt.plot(data['X'], model_dummies.predict(X_dummies), label='Regression Line (Temporary Dummies)', color='red')
plt.title('Regression with Temporary Dummy Variables')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
# Intercept dummy variable results
plt.subplot(3, 1, 2)
plt.scatter(data['X'], data['y'], label='Data', color='blue')
plt.plot(data['X'], model_constant_dummies.predict(X_constant_dummies), label='Regression Line (Constant Dummies)', color='green')
plt.title('Regression with Constant Dummy Variables')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
# Coefficient (slope) dummy variable results
plt.subplot(3, 1, 3)
plt.scatter(data['X'], data['y'], label='Data', color='blue')
plt.plot(data['X'], model_coefficients.predict(X_coefficients), label='Regression Line (Coefficient Dummies)', color='purple')
plt.title('Regression with Coefficient Dummy Variables')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Generate time-series data
np.random.seed(0)
dates = pd.date_range(start='2020-01-01', periods=100, freq='D')
data = np.cumsum(np.random.randn(100))  # cumulative sum produces a random walk
# Build the DataFrame
df = pd.DataFrame({'Date': dates, 'Value': data})
df.set_index('Date', inplace=True)
# Exponential moving average (EMA)
df['EMA'] = df['Value'].ewm(span=20, adjust=False).mean()
# Simple moving average (SMA)
df['SMA'] = df['Value'].rolling(window=20).mean()
# First difference (day-over-day change)
df['Change'] = df['Value'].diff()
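# Sanity check (added sketch): pandas' ewm with adjust=False follows the
# recursion ema_t = alpha * x_t + (1 - alpha) * ema_{t-1}, with alpha = 2 / (span + 1)
alpha = 2 / (20 + 1)
ema_manual = df['Value'].iloc[0]
for x_t in df['Value'].iloc[1:3]:
    ema_manual = alpha * x_t + (1 - alpha) * ema_manual
print('Manual EMA at step 3:', ema_manual, '| pandas EMA:', df['EMA'].iloc[2])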
# Draw the plots
plt.figure(figsize=(14, 10))
# Time-series plot
plt.subplot(3, 1, 1)
plt.plot(df.index, df['Value'], label='Value')
plt.plot(df.index, df['EMA'], label='EMA (20 days)', color='orange')
plt.plot(df.index, df['SMA'], label='SMA (20 days)', color='green')
plt.title('Time Series Data with EMA and SMA')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
# Plot of the changes
plt.subplot(3, 1, 2)
plt.plot(df.index, df['Change'], label='Change', color='red')
plt.title('Change Rate')
plt.xlabel('Date')
plt.ylabel('Change')
plt.legend()
plt.grid(True)
# Moving-average plot
plt.subplot(3, 1, 3)
plt.plot(df.index, df['Value'], label='Value')
plt.plot(df.index, df['SMA'], label='SMA (20 days)', color='green')
plt.title('Simple Moving Average')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition
decomposition_additive = seasonal_decompose(df['Value'], model='additive', period=20)
# Multiplicative decomposition requires strictly positive values,
# so shift the random-walk series above zero first
positive_values = df['Value'] - df['Value'].min() + 1
decomposition_multiplicative = seasonal_decompose(positive_values, model='multiplicative', period=20)
# Plot the decomposition results
plt.figure(figsize=(14, 12))
# Additive model components
plt.subplot(4, 1, 1)
plt.plot(df.index, df['Value'], label='Original')
plt.title('Original Time Series')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 2)
plt.plot(decomposition_additive.trend.index, decomposition_additive.trend, label='Trend')
plt.title('Additive Model - Trend')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 3)
plt.plot(decomposition_additive.seasonal.index, decomposition_additive.seasonal, label='Seasonality')
plt.title('Additive Model - Seasonality')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 4)
plt.plot(decomposition_additive.resid.index, decomposition_additive.resid, label='Residual')
plt.title('Additive Model - Residual')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Multiplicative model components
plt.figure(figsize=(14, 12))
plt.subplot(4, 1, 1)
plt.plot(df.index, df['Value'], label='Original')
plt.title('Original Time Series')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 2)
plt.plot(decomposition_multiplicative.trend.index, decomposition_multiplicative.trend, label='Trend')
plt.title('Multiplicative Model - Trend')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 3)
plt.plot(decomposition_multiplicative.seasonal.index, decomposition_multiplicative.seasonal, label='Seasonality')
plt.title('Multiplicative Model - Seasonality')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.subplot(4, 1, 4)
plt.plot(decomposition_multiplicative.resid.index, decomposition_multiplicative.resid, label='Residual')
plt.title('Multiplicative Model - Residual')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Chapter 3: Learning about the whole from a part
# 1. Population and sample
np.random.seed(0)
population = np.random.normal(loc=50, scale=10, size=1000)  # generate the population data
sample_size = 100
sample = np.random.choice(population, size=sample_size, replace=False)  # draw the sample
# Compute sample statistics
sample_mean = np.mean(sample)
sample_std = np.std(sample, ddof=1)
print(f'Sample Mean: {sample_mean:.2f}')
print(f'Sample Standard Deviation: {sample_std:.2f}')
# 2. Describing the data with a probability model
plt.figure(figsize=(12, 6))
sns.histplot(sample, kde=True, stat='density', linewidth=0)
plt.title('Sample Distribution with Probability Density Function')
plt.xlabel('Value')
plt.ylabel('Density')
plt.show()
# Chapter 4: Inferring the whole from a part
# 1. Hypothesis testing
population_mean = 50
t_stat, p_value = stats.ttest_1samp(sample, population_mean)
print(f'T-statistic: {t_stat:.2f}')
print(f'P-value: {p_value:.4f}')
alpha = 0.05
if p_value < alpha:
print("Reject the null hypothesis: The sample mean is significantly different from the population mean.")
else:
print("Fail to reject the null hypothesis: The sample mean is not significantly different from the population mean.")
# 2. Estimation
confidence_level = 0.95
degrees_freedom = sample_size - 1
confidence_interval = stats.t.interval(confidence_level, degrees_freedom, sample_mean, sample_std / np.sqrt(sample_size))
print(f'95% Confidence Interval for the Mean: ({confidence_interval[0]:.2f}, {confidence_interval[1]:.2f})')
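# Equivalent manual computation (added sketch): mean ± t_crit * standard error
t_crit = stats.t.ppf((1 + confidence_level) / 2, degrees_freedom)
margin = t_crit * sample_std / np.sqrt(sample_size)
print(f'Manual CI check: ({sample_mean - margin:.2f}, {sample_mean + margin:.2f})')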
# Chapter 5: From discrete to continuous
# 1. Probability distributions for continuous data
mean = 50
std_dev = 10
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
pdf = stats.norm.pdf(x, mean, std_dev)
plt.figure(figsize=(12, 6))
plt.plot(x, pdf, label='Normal Distribution PDF')
plt.title('Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()
# 2. Probability calculations with the normal distribution
prob_less_than_60 = stats.norm.cdf(60, mean, std_dev)
prob_between_40_and_60 = stats.norm.cdf(60, mean, std_dev) - stats.norm.cdf(40, mean, std_dev)
print(f'Probability of being less than 60: {prob_less_than_60:.4f}')
print(f'Probability of being between 40 and 60: {prob_between_40_and_60:.4f}')
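# Empirical check (added sketch): the simulated population above was drawn with
# the same mean and standard deviation, so its empirical fraction below 60
# should be close to the theoretical CDF value
print(f'Empirical P(X < 60): {np.mean(population < 60):.4f}')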
import numpy as np
from scipy import stats
# Generate sample data (an example that does not follow a normal distribution)
np.random.seed(0)
sample1 = np.random.exponential(scale=2, size=50)  # sample 1
sample2 = np.random.exponential(scale=3, size=50)  # sample 2
# Wilcoxon rank-sum test
stat, p_value = stats.ranksums(sample1, sample2)
print(f'Wilcoxon Rank-Sum Test Statistic: {stat:.2f}')
print(f'P-value: {p_value:.4f}')
# Sample data containing ties
sample1_with_ties = np.concatenate([sample1, [5, 5, 5]])  # add tied values
sample2_with_ties = np.concatenate([sample2, [3, 3]])  # add tied values
# Wilcoxon rank-sum test (with ties)
stat_with_ties, p_value_with_ties = stats.ranksums(sample1_with_ties, sample2_with_ties)
print(f'Wilcoxon Rank-Sum Test Statistic (with ties): {stat_with_ties:.2f}')
print(f'P-value (with ties): {p_value_with_ties:.4f}')
import numpy as np
from scipy import stats
# Case-control study data (2x2 contingency table)
# Example: disease status vs. treatment status
#               Treated   Untreated
# Diseased         20         10
# Not diseased     15         25
contingency_table = np.array([[20, 10], [15, 25]])
# Fisher's exact test
odds_ratio, p_value = stats.fisher_exact(contingency_table)
print(f'Odds Ratio: {odds_ratio:.2f}')
print(f'P-value: {p_value:.4f}')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
# Generate the data
np.random.seed(0)
n_samples = 100
dose = np.linspace(0, 100, n_samples)  # drug dose
effect = 5 + 0.3 * dose + np.random.normal(scale=5, size=n_samples)  # effect (linear relationship plus noise)
# Assess the dose-response relationship (linear regression)
X = dose.reshape(-1, 1)
y = effect
# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
plt.figure(figsize=(12, 6))
plt.scatter(dose, effect, color='blue', label='Observed Data')
plt.plot(dose, y_pred, color='red', label='Linear Regression Line')
plt.title('Dose-Response Relationship')
plt.xlabel('Dose')
plt.ylabel('Effect')
plt.legend()
plt.grid(True)
plt.show()
# Dose and effect data (for logistic regression)
# Binarize whether the effect exceeds a threshold
threshold = 20
binary_effect = (effect > threshold).astype(int)
# Fit the logistic regression model
X_train, X_test, y_train, y_test = train_test_split(X, binary_effect, test_size=0.3, random_state=0)
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_proba = logistic_model.predict_proba(X_test)[:, 1]
# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(12, 6))
plt.plot(fpr, tpr, color='blue', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
plt.title('Receiver Operating Characteristic Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()
# Select the probability threshold that maximizes Youden's J (TPR - FPR)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f'Optimal Threshold for Effect: {optimal_threshold:.2f}')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels import api as sm
# Prepare the data
np.random.seed(0)
data_size = 100
age = np.random.randint(20, 80, size=data_size)  # age
death_rate = 0.5 * (age / 80) + np.random.normal(scale=0.1, size=data_size)  # death rate
standard_population = np.random.randint(500, 1000, size=data_size)  # standard population
confounder = np.random.normal(size=data_size)  # confounder
data = pd.DataFrame({'age': age, 'death_rate': death_rate, 'standard_population': standard_population, 'confounder': confounder})
# 1. Age-adjusted death rate
def age_adjusted_death_rate(df):
    age_groups = pd.cut(df['age'], bins=[20, 40, 60, 80], labels=['20-39', '40-59', '60-79'])
    grouped = df.groupby(age_groups, observed=False).agg({'death_rate': 'mean', 'standard_population': 'sum'})
    total_standard_population = grouped['standard_population'].sum()
    # Weight each group's rate by its share of the standard population;
    # this column sums to the overall age-adjusted death rate
    grouped['weighted_death_rate'] = grouped['death_rate'] * grouped['standard_population'] / total_standard_population
    return grouped
age_adjusted = age_adjusted_death_rate(data)
print(f"Overall age-adjusted death rate: {age_adjusted['weighted_death_rate'].sum():.3f}")
plt.figure(figsize=(12, 6))
sns.barplot(x=age_adjusted.index, y=age_adjusted['weighted_death_rate'], palette='viridis')
plt.title('Population-Weighted Death Rate by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Weighted Death Rate')
plt.grid(True)
plt.show()
# 2. Adjusting for confounders by stratification
def stratify_and_plot(df, stratify_by):
strata = df.groupby(stratify_by)
plt.figure(figsize=(12, 6))
for name, group in strata:
plt.scatter(group['age'], group['death_rate'], label=f'{stratify_by} = {name}', alpha=0.6)
plt.title(f'Stratified Analysis by {stratify_by}')
plt.xlabel('Age')
plt.ylabel('Death Rate')
plt.legend()
plt.grid(True)
plt.show()
# Stratified analysis (here, an example that stratifies by age)
stratify_and_plot(data, 'age')
# 3. Adjusting for confounders via the partial correlation coefficient
def partial_correlation(df, x, y, control):
model_x = sm.OLS(df[x], sm.add_constant(df[control])).fit()
model_y = sm.OLS(df[y], sm.add_constant(df[control])).fit()
residuals_x = model_x.resid
residuals_y = model_y.resid
correlation = stats.pearsonr(residuals_x, residuals_y)[0]
plt.figure(figsize=(12, 6))
plt.scatter(residuals_x, residuals_y, alpha=0.6)
plt.title(f'Partial Correlation between {x} and {y} (controlling for {control})')
plt.xlabel(f'Residuals of {x}')
plt.ylabel(f'Residuals of {y}')
plt.grid(True)
plt.show()
return correlation
partial_corr = partial_correlation(data, 'age', 'death_rate', 'confounder')
print('Partial Correlation Coefficient:', partial_corr)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 1. Random variables and probability distributions
# Probability distribution of a die roll
dice_sides = [1, 2, 3, 4, 5, 6]
probabilities = [1/6] * 6  # each face has probability 1/6
# Display the random variable and its probability distribution
df = pd.DataFrame({'Side': dice_sides, 'Probability': probabilities})
print("Dice Probability Distribution:\n", df)
# 2. Expected value (mean) and variance
mean_dice = np.dot(dice_sides, probabilities)
variance_dice = np.dot((np.array(dice_sides) - mean_dice)**2, probabilities)
print("\nExpected Value (Mean) of Dice Roll:", mean_dice)
print("Variance of Dice Roll:", variance_dice)
# Plot the probability distribution
plt.figure(figsize=(12, 6))
plt.bar(dice_sides, probabilities, color='skyblue', edgecolor='black')
plt.xlabel('Dice Side')
plt.ylabel('Probability')
plt.title('Probability Distribution of Dice Roll')
plt.grid(True)
plt.show()
# Column: the expected value of a lottery ticket
# Lottery example: ticket price, winning probabilities, and prizes
ticket_price = 1.0  # price per ticket
prizes = [100, 50, 10]  # prize amounts (multiple tiers)
prize_probabilities = [1/1000, 1/500, 1/100]  # probability of winning each prize
total_probability = sum(prize_probabilities)  # total probability of winning any prize
# Expected value
expected_winning = sum(p * prize for p, prize in zip(prize_probabilities, prizes))
expected_value = expected_winning - ticket_price
print("\nExpected Value of Lottery Ticket:", expected_value)
# Plot the lottery prize distribution
plt.figure(figsize=(12, 6))
prize_labels = [f'Prize ${p}' for p in prizes]
plt.bar(prize_labels, prize_probabilities, color='lightgreen', edgecolor='black')
plt.xlabel('Prize')
plt.ylabel('Probability')
plt.title('Probability Distribution of Lottery Prizes')
plt.grid(True)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# 1. Uniform distribution
uniform_samples = np.random.uniform(low=0, high=10, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(uniform_samples, bins=30, kde=True, color='skyblue')
plt.title('Uniform Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 2. Exponential distribution
lambda_param = 1.0
exponential_samples = np.random.exponential(scale=1/lambda_param, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(exponential_samples, bins=30, kde=True, color='lightgreen')
plt.title('Exponential Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 3. Normal distribution
mu, sigma = 0, 1
normal_samples = np.random.normal(loc=mu, scale=sigma, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(normal_samples, bins=30, kde=True, color='salmon')
plt.title('Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 4. Another distribution: the beta distribution
alpha, beta = 2, 5
beta_samples = np.random.beta(a=alpha, b=beta, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(beta_samples, bins=30, kde=True, color='orchid')
plt.title('Beta Distribution')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Column: the normal distribution and standardized scores (hensachi)
def calculate_deviation_scores(data):
mean = np.mean(data)
std_dev = np.std(data)
deviation_scores = (data - mean) / std_dev
return deviation_scores
# Compute z-scores and their percentile ranks
deviation_scores = calculate_deviation_scores(normal_samples)
percentiles = stats.norm.cdf(deviation_scores) * 100  # cumulative standard-normal probability, as a percentage
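# Added sketch: the Japanese "hensachi" score rescales z-scores to mean 50 and
# standard deviation 10 (hensachi = 50 + 10 * z)
hensachi = 50 + 10 * deviation_scores
print(f"Hensachi range: {hensachi.min():.1f} to {hensachi.max():.1f}")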
plt.figure(figsize=(12, 6))
sns.histplot(percentiles, bins=30, kde=True, color='coral')
plt.title('Percentile Ranks from Deviation Scores')
plt.xlabel('Percentile')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
print(f"Mean of Normal Distribution: {np.mean(normal_samples)}")
print(f"Standard Deviation of Normal Distribution: {np.std(normal_samples)}")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# 1. Basic definition of probability
# A probability is a value between 0 and 1
def probability(event_count, total_count):
return event_count / total_count
# Sample data
total_outcomes = 1000
event_count = 250
prob = probability(event_count, total_outcomes)
print(f"Probability of the event: {prob:.2f}")
# 2. Basic theorems of probability
# Addition rule: P(A or B) = P(A) + P(B) - P(A and B)
P_A = 0.4
P_B = 0.5
P_A_and_B = 0.2
P_A_or_B = P_A + P_B - P_A_and_B
print(f"P(A or B): {P_A_or_B:.2f}")
# Multiplication rule: P(A and B) = P(A) * P(B) if A and B are independent
P_A = 0.3
P_B = 0.6
P_A_and_B_independent = P_A * P_B
print(f"P(A and B) for independent events: {P_A_and_B_independent:.2f}")
# 3. Conditional probability
# P(B|A) = P(A and B) / P(A)
P_A = 0.7
P_A_and_B = 0.3
P_B_given_A = P_A_and_B / P_A
print(f"P(B|A): {P_B_given_A:.2f}")
# 4. Bayes' theorem
# P(A|B) = P(B|A) * P(A) / P(B)
P_B_given_A = 0.8
P_A = 0.5
P_B = 0.6
P_A_given_B = P_B_given_A * P_A / P_B
print(f"P(A|B): {P_A_given_B:.2f}")
# Column: weather forecasts and statistics
# Sample data: past weather observations ('晴れ' = sunny, '雨' = rainy)
np.random.seed(42)
weather_data = np.random.choice(['晴れ', '雨'], size=1000, p=[0.7, 0.3])
# Marginal weather probabilities
prob_sunny = np.mean(weather_data == '晴れ')
prob_rainy = np.mean(weather_data == '雨')
print(f"Probability of sunny weather: {prob_sunny:.2f}")
print(f"Probability of rainy weather: {prob_rainy:.2f}")
# Conditional probability simulation: the probability of rain on the day after a sunny day
sunny_today = weather_data[:-1] == '晴れ'
rain_next_day = weather_data[1:] == '雨'
rain_after_sunny = np.mean(rain_next_day[sunny_today])
print(f"Probability of rain after a sunny day: {rain_after_sunny:.2f}")
# Prediction via Bayes' theorem
P_sunny = prob_sunny
P_rain_given_sunny = rain_after_sunny
P_rain = prob_rainy
P_sunny_given_rain = P_rain_given_sunny * P_sunny / P_rain
print(f"Probability of sunny weather given that it is raining: {P_sunny_given_rain:.2f}")
# Plot the results
plt.figure(figsize=(12, 6))
sns.histplot(weather_data, kde=False, color='lightblue', bins=2)
plt.title('Weather Distribution')
plt.xlabel('Weather')
plt.ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['晴れ', '雨'])
plt.grid(True)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# 1. Bernoulli distribution
p = 0.5  # probability of success
bernoulli_samples = np.random.binomial(n=1, p=p, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(bernoulli_samples, bins=2, kde=False, color='lightblue')
plt.title('Bernoulli Distribution')
plt.xlabel('Outcome')
plt.ylabel('Frequency')
plt.xticks(ticks=[0, 1], labels=['Failure (0)', 'Success (1)'])
plt.grid(True)
plt.show()
# 2. Geometric distribution
p = 0.5  # probability of success
geometric_samples = np.random.geometric(p=p, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(geometric_samples, bins=30, kde=True, color='lightgreen')
plt.title('Geometric Distribution')
plt.xlabel('Number of Trials')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 3. Binomial distribution
n = 10  # number of trials
p = 0.5  # probability of success
binomial_samples = np.random.binomial(n=n, p=p, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(binomial_samples, bins=range(n+2), kde=True, color='salmon')
plt.title('Binomial Distribution')
plt.xlabel('Number of Successes')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 4. Negative binomial distribution
r = 5  # number of successes required
p = 0.5  # probability of success
negative_binomial_samples = np.random.negative_binomial(n=r, p=p, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(negative_binomial_samples, bins=30, kde=True, color='orchid')
plt.title('Negative Binomial Distribution')
plt.xlabel('Number of Trials')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# 5. Poisson distribution
lambda_param = 3  # mean number of events
poisson_samples = np.random.poisson(lam=lambda_param, size=1000)
plt.figure(figsize=(12, 6))
sns.histplot(poisson_samples, bins=30, kde=True, color='coral')
plt.title('Poisson Distribution')
plt.xlabel('Number of Events')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Column: simulating a stochastic process (coin flips)
num_flips = 1000
flips = np.random.choice([0, 1], size=num_flips, p=[0.5, 0.5])
cumulative_sum = np.cumsum(flips)
plt.figure(figsize=(12, 6))
plt.plot(cumulative_sum, color='blue')
plt.title('Cumulative Sum of Coin Flips (Probability Process)')
plt.xlabel('Number of Flips')
plt.ylabel('Cumulative Sum')
plt.grid(True)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# 1. Estimating the mean
np.random.seed(42)
sample_data = np.random.normal(loc=50, scale=10, size=100)  # sample data
sample_mean = np.mean(sample_data)
sample_std = np.std(sample_data, ddof=1)
n = len(sample_data)
# Compute the confidence interval
confidence_level = 0.95
z_score = stats.norm.ppf((1 + confidence_level) / 2)
margin_of_error = z_score * (sample_std / np.sqrt(n))
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)
print(f"Sample Mean: {sample_mean:.2f}")
print(f"95% Confidence Interval for the Mean: {confidence_interval[0]:.2f} to {confidence_interval[1]:.2f}")
# 2. Interval estimation
# Plot
plt.figure(figsize=(12, 6))
plt.hist(sample_data, bins=20, edgecolor='k', alpha=0.7)
plt.axvline(sample_mean, color='red', linestyle='dashed', linewidth=1)
plt.axvline(confidence_interval[0], color='blue', linestyle='dashed', linewidth=1)
plt.axvline(confidence_interval[1], color='blue', linestyle='dashed', linewidth=1)
plt.title('Histogram of Sample Data with 95% Confidence Interval')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend(['Sample Mean', '95% Confidence Interval'])
plt.grid(True)
plt.show()
# 3. Estimating a proportion
successes = 30
trials = 50
sample_proportion = successes / trials
sample_proportion_std = np.sqrt(sample_proportion * (1 - sample_proportion) / trials)
# Compute the confidence interval
z_score = stats.norm.ppf((1 + confidence_level) / 2)
margin_of_error_proportion = z_score * sample_proportion_std
confidence_interval_proportion = (sample_proportion - margin_of_error_proportion, sample_proportion + margin_of_error_proportion)
print(f"Sample Proportion: {sample_proportion:.2f}")
print(f"95% Confidence Interval for the Proportion: {confidence_interval_proportion[0]:.2f} to {confidence_interval_proportion[1]:.2f}")
# Column: normal approximation
# Normal approximation to the binomial distribution
n_binom = 50
p_binom = 0.6
binom_samples = np.random.binomial(n=n_binom, p=p_binom, size=1000)
mean_binom = n_binom * p_binom
std_binom = np.sqrt(n_binom * p_binom * (1 - p_binom))
# Samples from the approximating normal distribution
norm_samples = np.random.normal(loc=mean_binom, scale=std_binom, size=1000)
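# Added sketch: comparing an exact binomial tail probability with its normal
# approximation (continuity-corrected) for P(X <= 30)
exact_tail = stats.binom.cdf(30, n_binom, p_binom)
approx_tail = stats.norm.cdf(30 + 0.5, mean_binom, std_binom)
print(f"P(X <= 30): exact {exact_tail:.4f}, normal approx {approx_tail:.4f}")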
plt.figure(figsize=(12, 6))
plt.hist(binom_samples, bins=30, density=True, alpha=0.6, color='blue', label='Binomial Distribution')
plt.hist(norm_samples, bins=30, density=True, alpha=0.6, color='red', label='Normal Approximation')
plt.title('Binomial Distribution vs. Normal Approximation')
plt.xlabel('Number of Successes')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# 1. One-sample t-test
np.random.seed(42)
sample_data = np.random.normal(loc=50, scale=10, size=30)  # sample data
known_mean = 52
# Run the t-test
t_stat, p_value = stats.ttest_1samp(sample_data, known_mean)
print(f"1-Sample t-Test: t-statistic = {t_stat:.2f}, p-value = {p_value:.4f}")
# 2. Two-sample t-test for paired samples
# Pre- and post-intervention data
pre_treatment = np.random.normal(loc=50, scale=10, size=30)
post_treatment = pre_treatment + np.random.normal(loc=5, scale=5, size=30)
# Run the paired t-test
t_stat_paired, p_value_paired = stats.ttest_rel(pre_treatment, post_treatment)
print(f"Paired t-Test: t-statistic = {t_stat_paired:.2f}, p-value = {p_value_paired:.4f}")
# 3. Two-sample t-test for independent samples
# Data from two different groups
group1 = np.random.normal(loc=50, scale=10, size=30)
group2 = np.random.normal(loc=55, scale=10, size=30)
# Run the independent-samples t-test
t_stat_independent, p_value_independent = stats.ttest_ind(group1, group2)
print(f"Independent t-Test: t-statistic = {t_stat_independent:.2f}, p-value = {p_value_independent:.4f}")
# Column: normality and regularity
# Test for normality
_, p_value_normality = stats.shapiro(sample_data)
print(f"Shapiro-Wilk Test for Normality: p-value = {p_value_normality:.4f}")
# Plots
plt.figure(figsize=(12, 6))
# One-sample t-test data
plt.subplot(2, 2, 1)
plt.hist(sample_data, bins=15, edgecolor='k', alpha=0.7)
plt.axvline(known_mean, color='red', linestyle='dashed', linewidth=1)
plt.title('1-Sample t-Test Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
# Paired-samples data
plt.subplot(2, 2, 2)
plt.plot(pre_treatment, label='Pre-Treatment', marker='o', linestyle='-', color='blue')
plt.plot(post_treatment, label='Post-Treatment', marker='o', linestyle='--', color='green')
plt.title('Paired t-Test Data')
plt.xlabel('Sample Index')
plt.ylabel('Value')
plt.legend()
# Independent-samples data
plt.subplot(2, 2, 3)
plt.hist(group1, bins=15, alpha=0.5, label='Group 1', color='blue')
plt.hist(group2, bins=15, alpha=0.5, label='Group 2', color='green')
plt.title('Independent t-Test Data')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.legend()
# Normality check
plt.subplot(2, 2, 4)
stats.probplot(sample_data, dist="norm", plot=plt)
plt.title('Q-Q Plot for Normality Test')
plt.tight_layout()
plt.show()
import numpy as np
import scipy.stats as stats
import scipy.optimize as opt
import matplotlib.pyplot as plt
# 1. The idea behind maximum likelihood
# Generate data (example: normal distribution)
np.random.seed(42)
data = np.random.normal(loc=5, scale=2, size=100)
# Negative log-likelihood function
def likelihood(params, data):
mu, sigma = params
return -np.sum(stats.norm.logpdf(data, loc=mu, scale=sigma))
# Maximum likelihood estimation
initial_guess = [0, 1]
result = opt.minimize(likelihood, initial_guess, args=(data,), bounds=[(None, None), (0.1, None)])
mu_mle, sigma_mle = result.x
print(f"MLE of mu: {mu_mle:.2f}")
print(f"MLE of sigma: {sigma_mle:.2f}")
# 2. Maximum likelihood for a continuous distribution
# Plot
plt.figure(figsize=(12, 6))
# Histogram of the data
plt.hist(data, bins=20, density=True, alpha=0.6, color='blue', label='Sample Data')
# Fitted density
x = np.linspace(min(data), max(data), 100)
pdf = stats.norm.pdf(x, loc=mu_mle, scale=sigma_mle)
plt.plot(x, pdf, 'r-', lw=2, label='MLE Fit')
plt.title('Histogram and MLE Fit')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# 3. Properties of the maximum likelihood estimator
# Evaluate the negative log-likelihood over a grid and plot it
def likelihood_function(mu, sigma):
return -np.sum(stats.norm.logpdf(data, loc=mu, scale=sigma))
mu_range = np.linspace(mu_mle - 2, mu_mle + 2, 100)
sigma_range = np.linspace(sigma_mle - 1, sigma_mle + 1, 100)
likelihood_values = np.array([[likelihood_function(mu, sigma) for sigma in sigma_range] for mu in mu_range])
# Plot the likelihood surface and the MLE
plt.figure(figsize=(12, 6))
plt.contourf(mu_range, sigma_range, -likelihood_values.T, levels=20, cmap='viridis')  # transpose so rows index sigma, as contourf expects
plt.colorbar(label='Log-Likelihood')
plt.scatter(mu_mle, sigma_mle, color='red', marker='x', s=100, label='MLE')
plt.title('Log-Likelihood Function')
plt.xlabel('mu')
plt.ylabel('sigma')
plt.legend()
plt.grid(True)
plt.show()
# 4. Maximizing the likelihood with Newton's method
# Newton-CG requires the gradient of the objective; for the normal distribution
# the gradient of the negative log-likelihood has a simple closed form
def neg_log_likelihood(params):
    mu, sigma = params
    return -np.sum(stats.norm.logpdf(data, loc=mu, scale=sigma))
def neg_log_likelihood_grad(params):
    mu, sigma = params
    d = data - mu
    return np.array([-np.sum(d) / sigma**2,
                     len(data) / sigma - np.sum(d**2) / sigma**3])
initial_guess_newton = [np.mean(data) + 1, np.std(data) * 1.5]  # moment-based start keeps sigma positive
result_newton = opt.minimize(neg_log_likelihood, initial_guess_newton,
                             method='Newton-CG', jac=neg_log_likelihood_grad)
mu_mle_newton, sigma_mle_newton = result_newton.x
print(f"MLE of mu (Newton's method): {mu_mle_newton:.2f}")
print(f"MLE of sigma (Newton's method): {sigma_mle_newton:.2f}")
# Plot the Newton-method MLE fit
plt.figure(figsize=(12, 6))
plt.hist(data, bins=20, density=True, alpha=0.6, color='blue', label='Sample Data')
pdf_newton = stats.norm.pdf(x, loc=mu_mle_newton, scale=sigma_mle_newton)
plt.plot(x, pdf_newton, 'g-', lw=2, label='MLE Fit (Newton)')
plt.title('Histogram and MLE Fit (Newton\'s Method)')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import matplotlib.pyplot as plt
# 1. What is incomplete data?
# As an example, generate right-censored survival data
np.random.seed(42)
n = 100
true_mean = 5
true_scale = 2
# Survival-time data
survival_times = np.random.exponential(scale=true_mean, size=n)
censoring_times = np.random.uniform(low=0, high=10, size=n)
observed_times = np.minimum(survival_times, censoring_times)
censored = survival_times > censoring_times
# Build the DataFrame
data = pd.DataFrame({
'Time': observed_times,
'Censored': censored
})
# 2. The Kaplan-Meier method
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(durations=data['Time'], event_observed=~data['Censored'])
# Plot
plt.figure(figsize=(12, 6))
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Curve')
plt.xlabel('Time')
plt.ylabel('Survival Probability')
plt.grid(True)
plt.show()
# 3. Parameter estimation for the exponential distribution
# MLE for the exponential rate under right-censoring:
# lambda = (number of observed events) / (total time at risk)
n_events = (~data['Censored']).sum()
lambda_mle = n_events / data['Time'].sum()
print(f"MLE of lambda for Exponential Distribution: {lambda_mle:.2f}")
# Plot the exponential fit
x = np.linspace(0, max(data['Time']), 100)
pdf_exp = lambda_mle * np.exp(-lambda_mle * x)
plt.figure(figsize=(12, 6))
plt.hist(data['Time'], bins=20, density=True, alpha=0.6, color='blue', label='Observed Data')
plt.plot(x, pdf_exp, 'r-', lw=2, label='Exponential Fit')
plt.title('Exponential Distribution Fit')
plt.xlabel('Time')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# 4. Parameter estimation for the Weibull distribution
# Negative log-likelihood of the Weibull distribution (censoring ignored here for simplicity)
def weibull_neg_log_likelihood(params, data):
c, scale = params
return -np.sum(stats.weibull_min.logpdf(data, c, scale=scale))
# Initial guess
initial_guess_weibull = [1.5, np.mean(data['Time'])]
# Maximum likelihood estimation (bounded so both parameters stay positive)
result_weibull = opt.minimize(weibull_neg_log_likelihood, initial_guess_weibull,
                              args=(data['Time'],), method='L-BFGS-B',
                              bounds=[(1e-3, None), (1e-3, None)])
shape_mle, scale_mle = result_weibull.x
print(f"MLE of shape parameter (c) for Weibull Distribution: {shape_mle:.2f}")
print(f"MLE of scale parameter for Weibull Distribution: {scale_mle:.2f}")
# Plot the Weibull fit
pdf_weibull = stats.weibull_min.pdf(x, shape_mle, scale=scale_mle)
plt.figure(figsize=(12, 6))
plt.hist(data['Time'], bins=20, density=True, alpha=0.6, color='blue', label='Observed Data')
plt.plot(x, pdf_weibull, 'g-', lw=2, label='Weibull Fit')
plt.title('Weibull Distribution Fit')
plt.xlabel('Time')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# 1. Simulation and statistical methods
# Example: generate normally distributed data and compute summary statistics
np.random.seed(42)
n = 1000
mean = 0
std_dev = 1
# Generate normally distributed random numbers
data = np.random.normal(loc=mean, scale=std_dev, size=n)
# Compute statistics
mean_estimate = np.mean(data)
std_dev_estimate = np.std(data)
print(f"Estimated Mean: {mean_estimate:.2f}")
print(f"Estimated Standard Deviation: {std_dev_estimate:.2f}")
# Plot
plt.figure(figsize=(12, 6))
plt.hist(data, bins=30, density=True, alpha=0.6, color='blue', label='Generated Data')
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mean, std_dev)
plt.plot(x, p, 'k', linewidth=2, label='Theoretical Normal Distribution')
plt.title('Histogram and Theoretical Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# 2. Generating random numbers from a given probability distribution
# Random numbers from an exponential distribution
lambda_exp = 1.5
exp_data = np.random.exponential(scale=1/lambda_exp, size=n)
# Plot
plt.figure(figsize=(12, 6))
plt.hist(exp_data, bins=30, density=True, alpha=0.6, color='green', label='Generated Data')
x = np.linspace(0, max(exp_data), 100)
p_exp = lambda_exp * np.exp(-lambda_exp * x)
plt.plot(x, p_exp, 'r', linewidth=2, label='Theoretical Exponential Distribution')
plt.title('Histogram and Theoretical Exponential Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
# 3. The Monte Carlo method and simulation
# Estimating pi with the Monte Carlo method
def monte_carlo_pi(num_samples):
np.random.seed(42)
x = np.random.uniform(-1, 1, num_samples)
y = np.random.uniform(-1, 1, num_samples)
inside_circle = x**2 + y**2 <= 1
pi_estimate = 4 * np.sum(inside_circle) / num_samples
return pi_estimate
num_samples = 10000
pi_estimate = monte_carlo_pi(num_samples)
print(f"Estimated Value of Pi: {pi_estimate:.4f}")
# Plot
x = np.random.uniform(-1, 1, num_samples)
y = np.random.uniform(-1, 1, num_samples)
inside_circle = x**2 + y**2 <= 1
plt.figure(figsize=(8, 8))
plt.scatter(x[inside_circle], y[inside_circle], color='blue', alpha=0.5, label='Inside Circle')
plt.scatter(x[~inside_circle], y[~inside_circle], color='red', alpha=0.5, label='Outside Circle')
plt.gca().set_aspect('equal', adjustable='box')
plt.title('Monte Carlo Simulation of Pi')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
# 1. Interval estimation
# Sample data
np.random.seed(42)
sample_data = np.random.normal(loc=10, scale=2, size=100)
mean_sample = np.mean(sample_data)
std_sample = np.std(sample_data, ddof=1)
confidence_level = 0.95
degrees_freedom = len(sample_data) - 1
confidence_interval = stats.t.interval(
confidence_level,
degrees_freedom,
mean_sample,
std_sample / np.sqrt(len(sample_data))
)
print(f"Confidence Interval: {confidence_interval}")
# Plot
plt.figure(figsize=(10, 6))
plt.hist(sample_data, bins=20, density=True, alpha=0.6, color='blue', label='Sample Data')
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mean_sample, std_sample)
plt.plot(x, p, 'k', linewidth=2, label='Theoretical Normal Distribution')
plt.axvline(confidence_interval[0], color='r', linestyle='--', label='Confidence Interval Lower Bound')
plt.axvline(confidence_interval[1], color='r', linestyle='--', label='Confidence Interval Upper Bound')
plt.title('Histogram with Confidence Interval')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()
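# A nonparametric alternative to the t-interval: a bootstrap percentile
# interval for the mean. A minimal sketch; the 10,000 resamples and the
# percentile method are illustrative choices, not the only options.
n_boot = 10000
boot_means = np.array([
    np.mean(np.random.choice(sample_data, size=len(sample_data), replace=True))
    for _ in range(n_boot)
])
boot_interval = np.percentile(boot_means, [2.5, 97.5])
print(f"Bootstrap 95% Confidence Interval: {boot_interval}")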
# 2. Hypothesis Testing
# One-sample t-test
t_stat, p_value = stats.ttest_1samp(sample_data, popmean=10)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
# Chi-squared test
observed = np.array([50, 30, 20])
expected = np.array([40, 40, 20])
chi2_stat, chi2_p_value = stats.chisquare(observed, expected)
print(f"Chi-squared Statistic: {chi2_stat:.4f}, P-value: {chi2_p_value:.4f}")
# 3. Correlation Coefficient and Regression Line
# Sample data
np.random.seed(42)
x = np.random.normal(5, 2, 100)
y = 2 * x + np.random.normal(0, 1, 100)
# Correlation coefficient
corr_coefficient, _ = stats.pearsonr(x, y)
print(f"Pearson Correlation Coefficient: {corr_coefficient:.4f}")
# Regression line
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
# Plot
plt.figure(figsize=(10, 6))
plt.scatter(x, y, label='Data Points')
plt.plot(x, slope * x + intercept, color='red', label='Fitted Line')
plt.title('Scatter Plot with Regression Line')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True)
plt.show()
# 4. Analysis of Variance (ANOVA)
# Group data
np.random.seed(42)
group1 = np.random.normal(10, 2, 30)
group2 = np.random.normal(12, 2, 30)
group3 = np.random.normal(14, 2, 30)
# ANOVA
f_statistic, p_value_anova = stats.f_oneway(group1, group2, group3)
print(f"ANOVA F-statistic: {f_statistic:.4f}, P-value: {p_value_anova:.4f}")
# Plot
plt.figure(figsize=(10, 6))
plt.boxplot([group1, group2, group3], labels=['Group 1', 'Group 2', 'Group 3'])
plt.title('ANOVA Boxplot')
plt.ylabel('Value')
plt.grid(True)
plt.show()
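# The F-test only says whether the group means differ; eta-squared
# (between-group sum of squares over total sum of squares) adds an effect
# size. A sketch computed directly from the groups above.
all_values = np.concatenate([group1, group2, group3])
grand_mean = np.mean(all_values)
ss_between = sum(len(g) * (np.mean(g) - grand_mean) ** 2 for g in [group1, group2, group3])
ss_total = np.sum((all_values - grand_mean) ** 2)
eta_squared = ss_between / ss_total
print(f"Eta-squared: {eta_squared:.4f}")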
# 5. Goodness-of-Fit Test
# Sample categorical data
np.random.seed(42)
data = np.random.choice(['A', 'B', 'C'], size=100, p=[0.2, 0.5, 0.3])
# Reindex so the observed counts align with the expected-count order (A, B, C);
# value_counts() sorts by frequency, which would misalign the comparison
observed_counts = pd.Series(data).value_counts().reindex(['A', 'B', 'C'])
expected_counts = np.array([20, 50, 30])
# Goodness-of-fit test
chi2_stat_goodness, chi2_p_value_goodness = stats.chisquare(observed_counts, expected_counts)
print(f"Goodness of Fit Chi-squared Statistic: {chi2_stat_goodness:.4f}, P-value: {chi2_p_value_goodness:.4f}")
# Plot
plt.figure(figsize=(10, 6))
plt.bar(observed_counts.index, observed_counts.values, alpha=0.6, color='blue', label='Observed')
plt.bar(['A', 'B', 'C'], expected_counts, alpha=0.6, color='red', label='Expected')
plt.title('Goodness of Fit Test')
plt.xlabel('Category')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
# 1. One-way ANOVA
# Generate group data
np.random.seed(42)
group1 = np.random.normal(10, 2, 30)
group2 = np.random.normal(12, 2, 30)
group3 = np.random.normal(14, 2, 30)
# ANOVA
f_statistic, p_value_anova = stats.f_oneway(group1, group2, group3)
print(f"ANOVA F-statistic: {f_statistic:.4f}, P-value: {p_value_anova:.4f}")
# Plot
plt.figure(figsize=(10, 6))
plt.boxplot([group1, group2, group3], labels=['Group 1', 'Group 2', 'Group 3'])
plt.title('One-way ANOVA Boxplot')
plt.ylabel('Value')
plt.grid(True)
plt.show()
# 2. Multiple Comparisons (Tukey's HSD Test)
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# Build the combined data
data = pd.DataFrame({
'Value': np.concatenate([group1, group2, group3]),
'Group': ['Group 1'] * len(group1) + ['Group 2'] * len(group2) + ['Group 3'] * len(group3)
})
# Tukey's HSD test
tukey = pairwise_tukeyhsd(endog=data['Value'], groups=data['Group'], alpha=0.05)
print(tukey)
# Plot
tukey.plot_simultaneous()
plt.show()
# 3. Two-way ANOVA
# Generate group data
np.random.seed(42)
factor_A = np.random.choice(['A1', 'A2'], size=60)
factor_B = np.random.choice(['B1', 'B2'], size=60)
values = np.random.normal(10, 2, 60) + (factor_A == 'A2') * 3 + (factor_B == 'B2') * 2
# Build the data frame
data_two_way = pd.DataFrame({
'Value': values,
'Factor_A': factor_A,
'Factor_B': factor_B
})
# ANOVA
import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('Value ~ C(Factor_A) * C(Factor_B)', data=data_two_way).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
# Plot
sns.catplot(x='Factor_A', y='Value', hue='Factor_B', kind='box', data=data_two_way)
plt.title('Two-way ANOVA Boxplot')
plt.show()
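# An interaction plot shows the Factor_A x Factor_B interaction more directly
# than side-by-side boxplots. A sketch using statsmodels' interaction_plot
# helper on the data frame built above.
from statsmodels.graphics.factorplots import interaction_plot
interaction_plot(
    x=data_two_way['Factor_A'],
    trace=data_two_way['Factor_B'],
    response=data_two_way['Value']
)
plt.show()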
# 4. Factorial Designs and ANOVA (a simple example)
# Generate example data for a factorial design
np.random.seed(42)
factor1 = np.random.choice(['F1', 'F2'], size=100)
factor2 = np.random.choice(['G1', 'G2'], size=100)
response = np.random.normal(5, 1, 100) + (factor1 == 'F2') * 2 + (factor2 == 'G2') * 3
# Build the data frame
data_factorial = pd.DataFrame({
'Response': response,
'Factor1': factor1,
'Factor2': factor2
})
# ANOVA
model_factorial = ols('Response ~ C(Factor1) * C(Factor2)', data=data_factorial).fit()
anova_table_factorial = sm.stats.anova_lm(model_factorial, typ=2)
print(anova_table_factorial)
# Plot
sns.catplot(x='Factor1', y='Response', hue='Factor2', kind='box', data=data_factorial)
plt.title('Factorial Design Boxplot')
plt.show()
# 5. Taguchi Method (a simple example)
# Data for a Taguchi-style analysis
np.random.seed(42)
factors = pd.DataFrame({
'Factor1': np.random.choice(['Level1', 'Level2'], size=20),
'Factor2': np.random.choice(['LevelA', 'LevelB'], size=20),
'Response': np.random.normal(50, 10, 20)
})
# Data analysis
model_taguchi = ols('Response ~ C(Factor1) * C(Factor2)', data=factors).fit()
anova_table_taguchi = sm.stats.anova_lm(model_taguchi, typ=2)
print(anova_table_taguchi)
# Plot
sns.catplot(x='Factor1', y='Response', hue='Factor2', kind='box', data=factors)
plt.title('Taguchi Method Boxplot')
plt.show()
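# Taguchi analyses usually summarize each factor level with a signal-to-noise
# (S/N) ratio rather than a plain mean. A sketch of the standard
# "larger-is-better" ratio, S/N = -10 * log10(mean(1 / y^2)), applied per
# level of Factor1; choosing this particular S/N form is an assumption.
def sn_larger_is_better(y):
    return -10 * np.log10(np.mean(1.0 / np.asarray(y) ** 2))
sn_by_level = factors.groupby('Factor1')['Response'].apply(sn_larger_is_better)
print("S/N ratio (larger-is-better) by Factor1 level:")
print(sn_by_level)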
import nltk
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
# Download the required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# Sample data
documents = [
"I love natural language processing. It is a fascinating field of study.",
"Text mining involves analyzing texts to extract meaningful information.",
"NLP and text mining are closely related fields with many practical applications.",
"Machine learning techniques are often used in text mining and NLP tasks."
]
# Text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W+', ' ', text)
    tokens = nltk.word_tokenize(text)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwords]
    return tokens
processed_docs = [preprocess_text(doc) for doc in documents]
# Compute TF-IDF on the pre-tokenized documents (identity tokenizer/preprocessor;
# token_pattern=None silences the warning about the unused default pattern)
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, token_pattern=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# Convert the TF-IDF results to a data frame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)
print("TF-IDF Matrix:")
print(tfidf_df)
# Plot a heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(tfidf_df.T, cmap='viridis', annot=True, cbar=True, xticklabels=['Doc1', 'Doc2', 'Doc3', 'Doc4'])
plt.title('TF-IDF Heatmap')
plt.xlabel('Documents')
plt.ylabel('Terms')
plt.show()
# Topic modeling (LDA)
dictionary = Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
# Display the topics
topics = lda_model.print_topics(num_words=3)
print("\nLDA Topics:")
for topic in topics:
    print(topic)
# Visualize the topics
def plot_lda_topics(lda_model, num_topics):
    topics = lda_model.show_topics(formatted=False, num_topics=num_topics, num_words=10)
    data = {}
    for i, topic in topics:
        words, probs = zip(*topic)
        # Index each topic's probabilities by its own words so topics with
        # different top words still align correctly in the data frame
        data[f'Topic {i+1}'] = pd.Series(probs, index=words)
    df = pd.DataFrame(data)
    # Plot
    df.plot(kind='bar', figsize=(12, 8))
    plt.title('LDA Topic Probabilities')
    plt.xlabel('Words')
    plt.ylabel('Probability')
    plt.show()
plot_lda_topics(lda_model, num_topics=2)
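# It is often useful to see which topic each document leans toward. A sketch
# using gensim's get_document_topics on the corpus built above.
print("\nDocument-Topic Distributions:")
for i, bow in enumerate(corpus):
    print(f"Doc{i+1}: {lda_model.get_document_topics(bow)}")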
import numpy as np
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
# Generate sample data (evaluation scores for literary works from different eras)
np.random.seed(0)
era1 = np.random.normal(loc=7, scale=1.5, size=30) # e.g., Showa period
era2 = np.random.normal(loc=8, scale=1.5, size=30) # e.g., Heisei period
# Mann-Whitney U test
stat, p_value = mannwhitneyu(era1, era2)
print("Mann-Whitney U Test Results")
print(f"U Statistic: {stat}")
print(f"p-value: {p_value}")
# Data visualization
plt.figure(figsize=(10, 6))
plt.boxplot([era1, era2], labels=['Showa Period', 'Heisei Period'])
plt.title('Evaluation Scores of Literary Works from Different Eras')
plt.ylabel('Evaluation Score')
plt.show()
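# A p-value alone does not convey magnitude; the rank-biserial correlation,
# r = 1 - 2U / (n1 * n2), is a simple effect size for the Mann-Whitney U test.
# A sketch using the U statistic computed above.
n1, n2 = len(era1), len(era2)
rank_biserial = 1 - (2 * stat) / (n1 * n2)
print(f"Rank-biserial correlation: {rank_biserial:.4f}")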
import numpy as np
from scipy.stats import kruskal
import matplotlib.pyplot as plt
# Generate sample data (sentiment scores for literary works from different genres)
np.random.seed(0)
genre1 = np.random.normal(loc=6, scale=1.2, size=30) # e.g., Romance
genre2 = np.random.normal(loc=7, scale=1.2, size=30) # e.g., Historical
genre3 = np.random.normal(loc=5.5, scale=1.2, size=30) # e.g., Mystery
# Kruskal-Wallis test
stat, p_value = kruskal(genre1, genre2, genre3)
print("Kruskal-Wallis Test Results")
print(f"Statistic: {stat}")
print(f"p-value: {p_value}")
# Data visualization
plt.figure(figsize=(10, 6))
plt.boxplot([genre1, genre2, genre3], labels=['Romance', 'Historical', 'Mystery'])
plt.title('Sentiment Scores for Literary Works from Different Genres')
plt.ylabel('Sentiment Score')
plt.show()
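# If the Kruskal-Wallis test is significant, pairwise follow-ups show which
# genres differ. A sketch using Mann-Whitney U tests with a Bonferroni
# correction (alpha divided by the number of comparisons).
from itertools import combinations
from scipy.stats import mannwhitneyu
groups = {'Romance': genre1, 'Historical': genre2, 'Mystery': genre3}
pairs = list(combinations(groups, 2))
alpha_adj = 0.05 / len(pairs)
for name_a, name_b in pairs:
    u_stat, p = mannwhitneyu(groups[name_a], groups[name_b])
    print(f"{name_a} vs {name_b}: U = {u_stat}, p = {p:.4f} "
          f"(significant at adjusted alpha {alpha_adj:.4f}: {p < alpha_adj})")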
import nltk
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
# Download the NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# Create sample data
documents = [
"I love natural language processing. It is a fascinating field of study.",
"Text mining involves analyzing texts to extract meaningful information.",
"NLP and text mining are closely related fields with many practical applications.",
"Machine learning techniques are often used in text mining and NLP tasks."
]
# 1. Corpus Creation and Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W+', ' ', text)
    tokens = nltk.word_tokenize(text)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwords]
    return tokens
processed_docs = [preprocess_text(doc) for doc in documents]
# 2. Frequency Analysis and TF-IDF (token_pattern=None silences the warning
# about the unused default pattern when a custom tokenizer is supplied)
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, token_pattern=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# Convert the TF-IDF results to a data frame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)
print("TF-IDF Matrix:")
print(tfidf_df)
# Plot a heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(tfidf_df.T, cmap='viridis', annot=True, cbar=True, xticklabels=['Doc1', 'Doc2', 'Doc3', 'Doc4'])
plt.title('TF-IDF Heatmap')
plt.xlabel('Documents')
plt.ylabel('Terms')
plt.show()
# 3. Clustering (KMeans)
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(tfidf_matrix)
clusters = kmeans.labels_
# Display the clustering result
print(f"KMeans Clusters: {clusters}")
# 4. Topic Modeling (LDA)
dictionary = Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
# Display the topics
topics = lda_model.print_topics(num_words=3)
print("\nLDA Topics:")
for topic in topics:
    print(topic)
# Visualize the topics
def plot_lda_topics(lda_model, num_topics):
    topics = lda_model.show_topics(formatted=False, num_topics=num_topics, num_words=10)
    data = {}
    for i, topic in topics:
        words, probs = zip(*topic)
        # Index each topic's probabilities by its own words so topics with
        # different top words still align correctly in the data frame
        data[f'Topic {i+1}'] = pd.Series(probs, index=words)
    df = pd.DataFrame(data)
    # Plot
    df.plot(kind='bar', figsize=(12, 8))
    plt.title('LDA Topic Probabilities')
    plt.xlabel('Words')
    plt.ylabel('Probability')
    plt.show()
plot_lda_topics(lda_model, num_topics=2)
# 5. Data Collection and Cleaning
# Illustrates one way text data might be collected
def collect_data():
    # In practice this function would gather data, e.g. via web scraping or an API
    collected_data = [
        "Sample text data collected from various sources.",
        "Another example of collected data for analysis."
    ]
    return collected_data
# Clean and preprocess the data
def clean_and_preprocess(data):
    cleaned_data = [preprocess_text(text) for text in data]
    return cleaned_data
collected_data = collect_data()
cleaned_data = clean_and_preprocess(collected_data)
print("Cleaned Data:")
print(cleaned_data)
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# 1. Chebyshev's Inequality and the Law of Large Numbers
# Generate data
np.random.seed(0)
data = np.random.normal(loc=0, scale=1, size=1000)  # normal distribution with mean 0, std dev 1
mean = np.mean(data)
std_dev = np.std(data)
k_values = np.linspace(1, 3, 100)
prob_bound = 1 / (k_values ** 2)
# Chebyshev's inequality plot
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(k_values, prob_bound, label='Chebyshev Bound')
plt.xlabel('k')
plt.ylabel('Probability Bound')
plt.title('Chebyshev\'s Inequality')
plt.grid(True)
plt.legend()
# Law of large numbers plot
sample_sizes = np.arange(10, 1000, 10)
sample_means = [np.mean(np.random.choice(data, size=n)) for n in sample_sizes]
plt.subplot(1, 2, 2)
plt.plot(sample_sizes, sample_means, label='Sample Mean', color='r')
plt.axhline(y=np.mean(data), color='g', linestyle='--', label='True Mean')
plt.xlabel('Sample Size')
plt.ylabel('Sample Mean')
plt.title('Law of Large Numbers')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
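# Sketch: compare Chebyshev's bound P(|X - mu| >= k*sigma) <= 1/k^2 with the
# empirical tail frequency of the generated data. The bound should hold, and
# for normal data it is typically very loose.
for k in [1.5, 2, 3]:
    empirical_tail = np.mean(np.abs(data - mean) >= k * std_dev)
    print(f"k = {k}: empirical tail = {empirical_tail:.4f}, Chebyshev bound = {1 / k**2:.4f}")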
# 2. Poisson Distribution
# Parameter
lambda_param = 5
# Plot the probability mass function
x = np.arange(0, 15)
poisson_pmf = stats.poisson.pmf(x, lambda_param)
plt.figure(figsize=(6, 4))
plt.bar(x, poisson_pmf, color='blue', alpha=0.7, label=f'λ = {lambda_param}')
plt.xlabel('k')
plt.ylabel('Probability')
plt.title('Poisson Distribution PMF')
plt.grid(True)
plt.legend()
plt.show()
# 3. Probability Density Functions of the Chi-Squared and t Distributions
# Chi-squared distribution
df_chi2 = 5  # degrees of freedom
x_chi2 = np.linspace(0, 20, 1000)
chi2_pdf = stats.chi2.pdf(x_chi2, df_chi2)
# t distribution
df_t = 10  # degrees of freedom
x_t = np.linspace(-5, 5, 1000)
t_pdf = stats.t.pdf(x_t, df_t)
# Plot
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(x_chi2, chi2_pdf, color='purple')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.title('Chi-Squared Distribution PDF')
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(x_t, t_pdf, color='orange')
plt.xlabel('x')
plt.ylabel('Probability Density')
plt.title('Student\'s t-Distribution PDF')
plt.grid(True)
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# 1. Probability and Its Basic Properties
# 1.1 Events and Probability
def probability(event, sample_space):
    return len(event) / len(sample_space)
# Define the sample space
sample_space = ['HH', 'HT', 'TH', 'TT']  # sample space for two coin tosses
# Define the events
event_A = ['HH', 'HT']  # A: the first coin comes up heads
event_B = ['HH', 'TH']  # B: the second coin comes up heads
# Compute the probabilities
p_A = probability(event_A, sample_space)
p_B = probability(event_B, sample_space)
p_A_and_B = probability(list(set(event_A) & set(event_B)), sample_space)
p_A_or_B = probability(list(set(event_A) | set(event_B)), sample_space)
print(f"P(A) = {p_A}")
print(f"P(B) = {p_B}")
print(f"P(A ∩ B) = {p_A_and_B}")
print(f"P(A ∪ B) = {p_A_or_B}")
# 1.2 Basic Properties of Probability
def is_valid_probability(p):
    return 0 <= p <= 1
# Validate the probabilities
print("Is P(A) valid?", is_valid_probability(p_A))
print("Is P(B) valid?", is_valid_probability(p_B))
# 2. Computing Various Probabilities
# 2.1 Independent Trials and Their Probabilities
def independent_prob(p1, p2):
    return p1 * p2
# Define the probabilities
p_HH = 0.5
p_TT = 0.5
# Probability of the independent trials
p_HH_TT = independent_prob(p_HH, p_TT)
print(f"P(HH and TT) = {p_HH_TT}")
# 2.2 Repeated Trials and Their Probabilities
def binomial_probability(n, k, p):
    return stats.binom.pmf(k, n, p)
# Define the parameters
n_trials = 10
p_success = 0.5
k_success = 5
# Probability over repeated trials
p_binomial = binomial_probability(n_trials, k_success, p_success)
print(f"P(X = {k_success}) in {n_trials} trials = {p_binomial}")
# 2.3 Conditional Probability
def conditional_probability(p_joint, p_condition):
    return p_joint / p_condition
# Conditional probability: P(A | B) = P(A ∩ B) / P(B)
p_A_given_B = conditional_probability(p_A_and_B, p_B)
print(f"P(A | B) = {p_A_given_B}")
# 2.4 Computing Various Probabilities
# Example: multinomial probability
p = [0.2, 0.5, 0.3]  # probability of each category
n = 10  # number of trials
prob = stats.multinomial.pmf([2, 5, 3], n, p)
print(f"Multinomial probability = {prob}")
# Study 1: Probability and Recurrence Relations
def probability_recurrence(n, p):
    # Build the binomial probabilities P(X = k) iteratively via the recurrence
    # P(k + 1) = P(k) * (n - k) / (k + 1) * p / (1 - p), with P(0) = (1 - p)**n
    probabilities = [(1 - p) ** n]
    for k in range(n):
        probabilities.append(probabilities[-1] * (n - k) / (k + 1) * p / (1 - p))
    return probabilities
# Example: probabilities via the recurrence
n = 5
p = 0.5
prob_recurrence = probability_recurrence(n, p)
plt.figure(figsize=(8, 4))
plt.bar(range(n + 1), prob_recurrence, color='blue', alpha=0.7)
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.title('Probability Recurrence')
plt.grid(True)
plt.show()
# Study 2: Maximum Probability over Repeated Trials
def max_binomial_prob(n, p):
    k_values = np.arange(0, n + 1)
    probs = binomial_probability(n, k_values, p)
    max_prob = np.max(probs)
    max_k = k_values[np.argmax(probs)]
    return max_k, max_prob
# Example: maximum probability over repeated trials
n = 10
p = 0.5
max_k, max_prob = max_binomial_prob(n, p)
print(f"最大確率は k = {max_k} のときで、確率は {max_prob}")
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# 1. Probability Distributions
# 1.1 Random Variables and Probability Distributions
# An example of a random variable and its distribution
# Example: the distribution of a fair die roll
def dice_distribution():
    outcomes = np.arange(1, 7)  # faces of the die
    probabilities = np.ones_like(outcomes) / len(outcomes)  # probability of each face
    return outcomes, probabilities
outcomes, probabilities = dice_distribution()
plt.figure(figsize=(6, 4))
plt.bar(outcomes, probabilities, color='skyblue', alpha=0.7)
plt.xlabel('Outcomes')
plt.ylabel('Probability')
plt.title('Dice Distribution')
plt.grid(True)
plt.show()
# 1.2 Binomial Distribution
def binomial_distribution(n, p):
    x = np.arange(0, n + 1)
    pmf = stats.binom.pmf(x, n, p)
    return x, pmf
# Parameters
n_trials = 10
p_success = 0.5
# Probability mass function (PMF) of the binomial distribution
x_binomial, pmf_binomial = binomial_distribution(n_trials, p_success)
plt.figure(figsize=(6, 4))
plt.bar(x_binomial, pmf_binomial, color='lightcoral', alpha=0.7)
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.title('Binomial Distribution PMF')
plt.grid(True)
plt.show()
# 2. Normal Distribution
# 2.1 Normal Distribution
def normal_distribution(mu, sigma, size=1000):
    data = np.random.normal(mu, sigma, size)
    return data
# Parameters
mu = 0
sigma = 1
data_normal = normal_distribution(mu, sigma)
# Histogram and probability density function (PDF) of the normal distribution
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 1000)
pdf = stats.norm.pdf(x, mu, sigma)
plt.figure(figsize=(12, 6))
# Histogram
plt.subplot(1, 2, 1)
plt.hist(data_normal, bins=30, density=True, color='lightgreen', alpha=0.7, edgecolor='black')
plt.plot(x, pdf, 'r-', lw=2)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normal Distribution Histogram')
plt.grid(True)
# Cumulative distribution function (CDF) of the normal distribution
cdf = stats.norm.cdf(x, mu, sigma)
plt.subplot(1, 2, 2)
plt.plot(x, cdf, 'b-', lw=2)
plt.xlabel('Value')
plt.ylabel('CDF')
plt.title('Normal Distribution CDF')
plt.grid(True)
plt.tight_layout()
plt.show()
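# Sketch: an empirical check of the 68-95-99.7 rule on the generated sample,
# comparing the fraction within k standard deviations with the theoretical value.
for k in [1, 2, 3]:
    within = np.mean(np.abs(data_normal - mu) <= k * sigma)
    theory = stats.norm.cdf(k) - stats.norm.cdf(-k)
    print(f"Within {k} sigma: {within:.3f} (theory: {theory:.3f})")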
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Create sample data
data = np.random.rand(10, 10)
# Create the heatmap
plt.figure(figsize=(8, 8))
sns.heatmap(data, cmap='magma', annot=True, fmt=".2f", linewidths=.5)
plt.title('Heatmap Art')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Create sample data
angles = np.linspace(0, 2 * np.pi, 100)
radii = np.abs(np.sin(angles) * np.cos(angles))
# Draw the petal-shaped polar curve
plt.figure(figsize=(8, 8))
ax = plt.subplot(111, projection='polar')
ax.plot(angles, radii, color='cyan', lw=2)
ax.fill(angles, radii, color='cyan', alpha=0.5)
plt.title('Polar Plot Art')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Create sample data
np.random.seed(0)
x = np.random.rand(100)
y = np.random.rand(100)
colors = np.random.rand(100)
sizes = 1000 * np.random.rand(100)
# Create the scatter plot
plt.figure(figsize=(10, 8))
plt.scatter(x, y, c=colors, s=sizes, alpha=0.5, cmap='viridis')
plt.colorbar()  # add a colorbar
plt.title('Scatter Plot Art')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.show()
import plotly.express as px
import pandas as pd
import numpy as np
# Create sample data
df = pd.DataFrame({
'x': np.random.rand(100),
'y': np.random.rand(100),
'color': np.random.rand(100)
})
# Create the scatter plot
fig = px.scatter(df, x='x', y='y', color='color', size='color',
color_continuous_scale='Rainbow', title='Interactive Scatter Plot Art')
fig.show()
import numpy as np
import matplotlib.pyplot as plt
# Compute the Mandelbrot set
def mandelbrot(c, max_iter=1000):
    z = c
    for n in range(max_iter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return max_iter
x = np.linspace(-2.5, 1.5, 1000)
y = np.linspace(-2.0, 2.0, 1000)
X, Y = np.meshgrid(x, y)
C = X + 1j * Y
M = np.array([mandelbrot(c) for c in C.flatten()])
M = M.reshape(X.shape)
# Draw the fractal art
plt.figure(figsize=(10, 10))
plt.imshow(M, extent=(-2.5, 1.5, -2.0, 2.0), cmap='inferno', interpolation='bilinear')
plt.title('Mandelbrot Set Art')
plt.axis('off')
plt.show()
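# The per-point loop above is simple but slow on a 1000x1000 grid. A sketch of
# a vectorized NumPy version that iterates the whole grid at once; max_iter is
# reduced here purely to keep the example fast.
def mandelbrot_grid(C, max_iter=100):
    Z = np.zeros_like(C)
    counts = np.full(C.shape, max_iter, dtype=int)
    alive = np.ones(C.shape, dtype=bool)  # points that have not escaped yet
    for n in range(max_iter):
        Z[alive] = Z[alive] ** 2 + C[alive]
        escaped = alive & (np.abs(Z) > 2)
        counts[escaped] = n
        alive &= ~escaped
    return counts
M_fast = mandelbrot_grid(C)
plt.figure(figsize=(10, 10))
plt.imshow(M_fast, extent=(-2.5, 1.5, -2.0, 2.0), cmap='inferno', interpolation='bilinear')
plt.title('Mandelbrot Set (Vectorized)')
plt.axis('off')
plt.show()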
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
# Generate the data
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x)
# Set up the animation
fig, ax = plt.subplots()
line, = ax.plot(x, y, color='cyan')
def update(frame):
    line.set_ydata(np.sin(x + frame / 10.0))
    return line,
ani = animation.FuncAnimation(fig, update, frames=100, interval=50, blit=True)
plt.title('Sinusoidal Animation Art')
plt.show()