0
0

データサイエンス統計

Posted at
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Sample data
data = np.random.normal(0, 1, 1000)

# Statistical metrics
mean = np.mean(data)
variance = np.var(data)
std_dev = np.std(data)
skewness = stats.skew(data)
kurtosis = stats.kurtosis(data)
data_min = np.min(data)
data_max = np.max(data)

# Confidence interval (95%)
confidence = stats.norm.interval(0.95, loc=mean, scale=std_dev/np.sqrt(len(data)))

# Print metrics
print(f'Mean: {mean}')
print(f'Variance: {variance}')
print(f'Standard Deviation: {std_dev}')
print(f'Skewness: {skewness}')
print(f'Kurtosis: {kurtosis}')
print(f'Minimum: {data_min}')
print(f'Maximum: {data_max}')
print(f'95% Confidence Interval: {confidence}')

# Plotting
plt.hist(data, bins=30, alpha=0.6, color='g', edgecolor='black')
plt.axvline(mean, color='r', linestyle='dashed', linewidth=2, label=f'Mean: {mean:.2f}')
plt.axvline(data_min, color='blue', linestyle='dashed', linewidth=2, label=f'Min: {data_min:.2f}')
plt.axvline(data_max, color='purple', linestyle='dashed', linewidth=2, label=f'Max: {data_max:.2f}')
plt.axvline(confidence[0], color='orange', linestyle='dashed', linewidth=2, label=f'95% CI: {confidence[0]:.2f}')
plt.axvline(confidence[1], color='orange', linestyle='dashed', linewidth=2)
plt.legend()
plt.title('Data Distribution with Statistical Metrics')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Study hours data (example dataset)
study_hours = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20])

# Statistical metrics
mean = np.mean(study_hours)
variance = np.var(study_hours)
std_dev = np.std(study_hours)
skewness = stats.skew(study_hours)
kurtosis = stats.kurtosis(study_hours)
mode = stats.mode(study_hours)[0]  # Access mode directly
data_min = np.min(study_hours)
q1 = np.percentile(study_hours, 25)
median = np.median(study_hours)
q3 = np.percentile(study_hours, 75)
data_max = np.max(study_hours)
iqr = q3 - q1  # Interquartile range

# Standardization
standardized_data = (study_hours - mean) / std_dev

# Print statistical summary
print(f'Mean: {mean}')
print(f'Variance: {variance}')
print(f'Standard Deviation: {std_dev}')
print(f'Skewness: {skewness}')
print(f'Kurtosis: {kurtosis}')

print(f'Minimum: {data_min}')
print(f'1st Quartile: {q1}')
print(f'Median: {median}')
print(f'3rd Quartile: {q3}')
print(f'Maximum: {data_max}')
print(f'Interquartile Range: {iqr}')

# Boxplot
plt.figure(figsize=(10, 6))
plt.boxplot(study_hours, vert=False)
plt.title('Boxplot of Study Hours')
plt.xlabel('Study Hours')
plt.show()

# Plot of standardized data
plt.figure(figsize=(10, 6))
sns.kdeplot(standardized_data, color='blue', label='Standardized Data', fill=True)
plt.axvline(0, color='red', linestyle='dashed', linewidth=2, label='Mean (Standardized)')
plt.title('Density Plot of Standardized Study Hours')
plt.xlabel('Standardized Value')
plt.legend()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from skimage.transform import radon
from skimage.data import shepp_logan_phantom

# Create a sample phantom image (Shepp-Logan phantom used in tomography)
image = shepp_logan_phantom()

# Define the angles for the tomographic lines (projection angles)
theta = np.linspace(0., 180., max(image.shape), endpoint=False)

# Apply the Radon transform (tomography)
sinogram = radon(image, theta=theta, circle=True)

# Plot the original image and the Radon transform (tomographic lines)
plt.figure(figsize=(10, 8))

# Original image (phantom)
plt.subplot(1, 2, 1)
plt.imshow(image, cmap='gray')
plt.title('Original Image (Phantom)')
plt.axis('off')

# Radon transform (Sinogram)
plt.subplot(1, 2, 2)
plt.imshow(sinogram, cmap='gray', aspect='auto', extent=(0, 180, 0, sinogram.shape[0]))
plt.title('Radon Transform (Sinogram)')
plt.xlabel('Projection Angle (degrees)')
plt.ylabel('Projection Position')

plt.tight_layout()
plt.show()

import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt

# Sample data (X: independent variable, Y: dependent variable)
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])
Y = np.array([45, 47, 49, 52, 55, 57, 59, 62, 64, 67, 70, 72, 75, 77, 80, 82, 85, 87, 90, 92, 95, 97, 100, 102, 105, 107, 110, 112, 115, 118])

# Add a constant to the independent variable (for intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(Y, X)
results = model.fit()

# Print regression statistics
print(f'Multiple Correlation R: {np.sqrt(results.rsquared)}')
print(f'R^2: {results.rsquared}')
print(f'Adjusted R^2: {results.rsquared_adj}')
print(f'Standard Error: {np.sqrt(results.mse_resid)}')
print(f'F-statistic: {results.fvalue}')
print(f'P-value of F-statistic: {results.f_pvalue}')
print(f'Number of observations: {results.nobs}')



# Confidence intervals (95%)
conf_intervals = results.conf_int(0.05)
print(f'95% Confidence Intervals:\n{conf_intervals}')

# Plotting the regression line with confidence intervals
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 1], Y, label='Observed data', color='blue')
plt.plot(X[:, 1], results.fittedvalues, color='red', label=f'Regression line (R² = {results.rsquared:.2f})')
plt.fill_between(X[:, 1], conf_intervals[0, 0] + conf_intervals[1, 0] * X[:, 1], 
                 conf_intervals[0, 1] + conf_intervals[1, 1] * X[:, 1], color='gray', alpha=0.2, label='95% Confidence Interval')
plt.title('Linear Regression with 95% Confidence Interval')
plt.xlabel('Independent Variable (X)')
plt.ylabel('Dependent Variable (Y)')
plt.legend()
plt.show()

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Sample data (X: independent variable, Y: dependent variable)
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
Y = np.array([3, 4, 5, 7, 9, 11, 13, 14, 15, 19])

# Add a constant to the independent variable (for intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(Y, X)
results = model.fit()

# Predicted values
Y_pred = results.fittedvalues

# Calculate Sum of Squares
SST = np.sum((Y - np.mean(Y)) ** 2)  # Total Sum of Squares
SSR = np.sum((Y_pred - np.mean(Y)) ** 2)  # Sum of Squares for Regression
SSE = np.sum((Y - Y_pred) ** 2)  # Residual Sum of Squares (Error)

# Calculate R^2 and Adjusted R^2
R_squared = SSR / SST
adjusted_R_squared = 1 - ((1 - R_squared) * (len(Y) - 1) / (len(Y) - X.shape[1]))

# Standard error of the residuals
n = len(Y)
p = X.shape[1] - 1  # number of predictors
residual_std_error = np.sqrt(SSE / (n - p - 1))

# Multiple correlation coefficient
R = np.sqrt(R_squared)

# Print the statistics
print(f"SSR (Sum of Squares for Regression): {SSR}")
print(f"SSE (Residual Sum of Squares): {SSE}")
print(f"SST (Total Sum of Squares): {SST}")
print(f"R^2: {R_squared}")
print(f"Adjusted R^2: {adjusted_R_squared}")
print(f"Residual Standard Error: {residual_std_error}")
print(f"Multiple Correlation Coefficient (R): {R}")

# Plot actual vs predicted
plt.figure(figsize=(10, 6))
plt.scatter(Y, Y_pred, color='blue', label='Predicted vs Actual')
plt.plot([min(Y), max(Y)], [min(Y), max(Y)], color='red', label='Perfect Fit')
plt.title('Actual vs Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, binom, beta
from scipy.integrate import quad

# Parameters
n = 20  # Number of trials for binomial distribution
p = 0.5  # Probability of success for binomial distribution
a, b = 2, 5  # Parameters for beta distribution

# Normal Distribution
x = np.linspace(-4, 4, 1000)
pdf_normal = norm.pdf(x, loc=0, scale=1)

# Binomial Distribution
k = np.arange(0, n+1)
pmf_binom = binom.pmf(k, n, p)

# Beta Distribution
x_beta = np.linspace(0, 1, 1000)
pdf_beta = beta.pdf(x_beta, a, b)

# Hazard Function and Instantaneous Failure Rate
def hazard_function(t, beta=1.5, eta=10):
    return (beta / eta) * (t / eta) ** (beta - 1)

def instantaneous_failure_rate(t, beta=1.5, eta=10):
    return hazard_function(t, beta, eta)

# Time values
t = np.linspace(0.1, 30, 500)
hazard = hazard_function(t)
instantaneous_failure = instantaneous_failure_rate(t)

# Plot Normal Distribution
plt.figure(figsize=(14, 10))

plt.subplot(2, 3, 1)
plt.plot(x, pdf_normal, color='blue')
plt.title('Normal Distribution PDF')
plt.xlabel('x')
plt.ylabel('Probability Density')

# Plot Binomial Distribution
plt.subplot(2, 3, 2)
plt.stem(k, pmf_binom, basefmt='C0-', use_line_collection=True)
plt.title('Binomial Distribution PMF')
plt.xlabel('k')
plt.ylabel('Probability')

# Plot Beta Distribution
plt.subplot(2, 3, 3)
plt.plot(x_beta, pdf_beta, color='green')
plt.title('Beta Distribution PDF')
plt.xlabel('x')
plt.ylabel('Probability Density')

# Plot Hazard Function
plt.subplot(2, 3, 4)
plt.plot(t, hazard, color='red')
plt.title('Hazard Function')
plt.xlabel('Time')
plt.ylabel('Hazard Rate')

# Plot Instantaneous Failure Rate
plt.subplot(2, 3, 5)
plt.plot(t, instantaneous_failure, color='purple')
plt.title('Instantaneous Failure Rate')
plt.xlabel('Time')
plt.ylabel('Failure Rate')

# Plot Failure Periods
plt.subplot(2, 3, 6)
plt.plot(t, hazard, label='Early Failure Period', color='orange')
plt.plot(t, np.ones_like(t) * 0.5, label='Random Failure Period', color='cyan')
plt.plot(t, np.exp(-0.1 * t), label='Wear-Out Failure Period', color='magenta')
plt.title('Failure Periods')
plt.xlabel('Time')
plt.ylabel('Failure Rate')
plt.legend()

plt.tight_layout()
plt.show()

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import expon, chi2
from scipy.optimize import newton

# Parameters
mu = 20.6  # Estimated parameter for the exponential distribution
n = 10     # Number of observations
alpha = 0.05  # Significance level for confidence interval

# Generate sample data
np.random.seed(0)
data = np.random.exponential(scale=mu, size=n)
W = np.sum(data)  # Sum of the observations

# Calculate the maximum likelihood estimate of mu
mu_hat = W / n

# Variance and standard deviation of the exponential distribution
variance = mu**2
std_dev = mu

# Chi-squared distribution for confidence intervals
df = 2 * n
chi2_upper = chi2.ppf(1 - alpha/2, df)
chi2_lower = chi2.ppf(alpha/2, df)

# Confidence interval for mu
CI_lower = (2 * W) / chi2_upper
CI_upper = (2 * W) / chi2_lower

# Probability density function for exponential distribution
x = np.linspace(0, 2 * mu, 1000)
pdf_exp = expon.pdf(x, scale=mu)

# Plot Exponential Distribution PDF
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(x, pdf_exp, color='blue')
plt.title('Exponential Distribution PDF')
plt.xlabel('x')
plt.ylabel('Probability Density')

# Plot Confidence Interval
plt.subplot(2, 2, 2)
plt.fill_between([CI_lower, CI_upper], [0, 0], [1, 1], color='yellow', alpha=0.5)
plt.axvline(CI_lower, color='red', linestyle='--', label='Lower CI Bound')
plt.axvline(CI_upper, color='green', linestyle='--', label='Upper CI Bound')
plt.title('Confidence Interval for $\mu$')
plt.xlabel('Value of $\mu$')
plt.ylabel('Density')
plt.legend()

# Plot Histogram of Data
plt.subplot(2, 2, 3)
plt.hist(data, bins=10, density=True, alpha=0.6, color='g', edgecolor='black')
plt.plot(x, pdf_exp, color='blue', lw=2)
plt.title('Histogram of Sample Data with Exponential PDF')
plt.xlabel('x')
plt.ylabel('Density')

# Plot Chi-Squared Distribution for Confidence Interval
plt.subplot(2, 2, 4)
chi2_values = np.linspace(0, 50, 1000)
chi2_pdf = chi2.pdf(chi2_values, df)
plt.plot(chi2_values, chi2_pdf, color='purple')
plt.axvline(chi2_upper, color='red', linestyle='--', label='Upper Bound (Chi-squared)')
plt.axvline(chi2_lower, color='green', linestyle='--', label='Lower Bound (Chi-squared)')
plt.title('Chi-Squared Distribution for Confidence Interval')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.legend()

plt.tight_layout()
plt.show()

print(f"Estimated parameter (mu): {mu_hat:.2f}")
print(f"95% Confidence Interval for mu: ({CI_lower:.2f}, {CI_upper:.2f})")

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal, beta
from matplotlib.patches import Ellipse

# 2D Normal Distribution Parameters
mean = [0, 0]
cov = [[1, 0.5], [0.5, 1]]  # Covariance matrix

# 2D Beta Distribution Parameters
alpha = 2.0
beta_param = 5.0

# Create a grid of points
x = np.linspace(-4, 4, 100)
y = np.linspace(-4, 4, 100)
X, Y = np.meshgrid(x, y)
pos = np.dstack((X, Y))

# Compute PDF for 2D Normal Distribution
rv = multivariate_normal(mean, cov)
pdf_normal = rv.pdf(pos)

# Compute PDF for 2D Beta Distribution
# Beta distribution is defined on the interval [0,1], so we map the coordinates to [0,1]
X_beta = (X - np.min(X)) / (np.max(X) - np.min(X))
Y_beta = (Y - np.min(Y)) / (np.max(Y) - np.min(Y))
pdf_beta = beta.pdf(X_beta, alpha, beta_param) * beta.pdf(Y_beta, alpha, beta_param)

# Plot 2D Normal Distribution
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.contourf(X, Y, pdf_normal, cmap='viridis')
plt.colorbar(label='Probability Density')
plt.title('2D Normal Distribution')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

# Plot 2D Beta Distribution
plt.subplot(1, 2, 2)
plt.contourf(X, Y, pdf_beta, cmap='plasma')
plt.colorbar(label='Probability Density')
plt.title('2D Beta Distribution')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

plt.tight_layout()
plt.show()

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0