何らかの測定データの異常値を検出する方法をサンプルデータを用いて3つ示す.
移動平均法
MovingAve_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sample data: 30 consecutive daily measurements.
production_data = [
    # days 1-10
    520, 510, 515, 530, 525,
    535, 540, 515, 525, 530,
    # days 11-20
    520, 510, 515, 525, 530,
    535, 540, 545, 550, 555,
    # days 21-30 (the final reading is abnormally high)
    530, 525, 520, 515, 510,
    505, 500, 495, 490, 700,
]
def detect_outliers_moving_average(data, window=5, n_sigmas=2):
    """Flag points that fall outside a moving-average band.

    Each point is compared with the mean and sample standard deviation of
    the ``window`` points that immediately precede it.  The point under test
    is deliberately kept out of its own baseline: inside a window of ``n``
    samples that contains the point, its z-score can never exceed
    ``(n - 1) / sqrt(n)`` (about 1.79 for ``n = 5``), so a band of two or
    more sigmas around a self-including window would never flag anything.
    Using the preceding points also lets the most recent measurement be
    evaluated, which a centred window cannot do.

    Args:
        data: Numeric measurements in chronological order.
        window: Number of preceding points used to build the baseline.
        n_sigmas: Half-width of the accepted band, in standard deviations.

    Returns:
        A ``(df, outliers)`` tuple.  ``df`` is a DataFrame with the columns
        ``value``, ``moving_avg``, ``moving_std``, ``lower_bound``,
        ``upper_bound`` and ``is_outlier``.  ``outliers`` is the list of
        zero-based positions whose value lies outside the band.  The first
        ``window`` points have no complete baseline, so their statistics are
        NaN and they are never flagged.
    """
    df = pd.DataFrame(data, columns=['value'])

    # shift(1) moves every value one step later, so the rolling window that
    # ends at row i covers rows i-window .. i-1 and excludes row i itself.
    history = df['value'].shift(1).rolling(window=window)
    df['moving_avg'] = history.mean()
    df['moving_std'] = history.std()

    band = n_sigmas * df['moving_std']
    df['lower_bound'] = df['moving_avg'] - band
    df['upper_bound'] = df['moving_avg'] + band

    # Comparisons against NaN bounds evaluate to False, so rows without a
    # full baseline are reported as normal.
    df['is_outlier'] = (df['value'] < df['lower_bound']) | (df['value'] > df['upper_bound'])
    outliers = df[df['is_outlier']].index.tolist()
    return df, outliers
# Detect the outliers
df, outliers = detect_outliers_moving_average(production_data)
print("Detected Outliers:")
for day in outliers:
    print(f"Date: {day+1}, Volume: {production_data[day]}")

# Plot the data.
# The x axis is derived from the data length (1-based day numbers) so the
# plot keeps working when the number of measurements changes.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Volume')
plt.plot(days, df['moving_avg'], label='Moving Average')
plt.fill_between(days, df['lower_bound'], df['upper_bound'], alpha=0.2, label='Normal range')
# Highlight every detected outlier in red.
for day in outliers:
    plt.plot(day+1, production_data[day], 'ro', markersize=10)
plt.title('Outlier Detection by Moving Average')
plt.xlabel('Day')
plt.ylabel('Volume')
plt.legend()
plt.grid(True)
plt.show()
IQR
IQR_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Sample data: 30 consecutive daily measurements.
production_data = [
    # days 1-10
    520, 510, 515, 530, 525,
    535, 540, 515, 525, 530,
    # days 11-20
    520, 510, 515, 525, 530,
    535, 540, 545, 550, 555,
    # days 21-30 (the final reading is abnormally high)
    530, 525, 520, 515, 510,
    505, 500, 495, 490, 700,
]
def detect_outliers_iqr(data, whisker=1.5):
    """Return the points lying outside the interquartile-range fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    ``IQR = Q3 - Q1`` and the quartiles come from ``np.percentile``.

    Args:
        data: Sized sequence of numeric measurements.
        whisker: Multiplier applied to the IQR to position the fences.
            The default of 1.5 reproduces the original fixed behaviour.

    Returns:
        A list of ``(index, value)`` tuples, in input order, for every value
        strictly below the lower fence or strictly above the upper fence.
        An empty input yields an empty list.
    """
    # Quartiles are undefined for empty input; report "no outliers" instead
    # of letting np.percentile fail or warn.
    if len(data) == 0:
        return []

    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (whisker * iqr)
    upper_bound = q3 + (whisker * iqr)
    return [(i, x) for i, x in enumerate(data) if x < lower_bound or x > upper_bound]
# Detect the outliers
outliers = detect_outliers_iqr(production_data)
print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}")

# Plot the data.
# The x axis is derived from the data length (1-based day numbers) so the
# plot keeps working when the number of measurements changes.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Production Volume')
for day, value in outliers:
    # Only the first outlier carries a label so the legend shows one entry.
    plt.plot(day+1, value, 'ro', markersize=10, label='Outlier' if day == outliers[0][0] else "")

# Draw the IQR fences (recomputed here because the detector returns only
# the outlying points, not the bounds).
q1 = np.percentile(production_data, 25)
q3 = np.percentile(production_data, 75)
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
plt.axhline(y=lower_bound, color='r', linestyle='--', label='IQR')
plt.axhline(y=upper_bound, color='r', linestyle='--')
plt.title('Production Volume and Outlier Detection by IQR')
plt.xlabel('Date')
plt.ylabel('Production Volume')
plt.legend()
plt.grid(True)
plt.show()
Z Score
zscore_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
# Sample data: 30 consecutive daily measurements.
production_data = [
    # days 1-10
    520, 510, 515, 530, 525,
    535, 540, 515, 525, 530,
    # days 11-20
    520, 510, 515, 525, 530,
    535, 540, 545, 550, 555,
    # days 21-30 (the final reading is abnormally high)
    530, 525, 520, 515, 510,
    505, 500, 495, 490, 700,
]
# Function that computes the z-score of every data point
def calculate_zscore(data):
    """Return the z-score of each element of ``data``.

    The z-score is ``(x - mean) / std`` using the population standard
    deviation (NumPy's default, ``ddof=0``).

    Args:
        data: Sequence of numeric measurements.

    Returns:
        A list with one z-score per input element, in input order.  An empty
        input yields an empty list.  When every value is identical the
        standard deviation is zero and all z-scores are reported as ``0.0``
        instead of dividing by zero.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    mean = values.mean()
    std = values.std()
    # No spread means no point deviates from the mean; avoid the 0/0
    # division that would otherwise produce NaN and a RuntimeWarning.
    if std == 0:
        return [0.0] * values.size

    return list((values - mean) / std)
# Compute the z-scores
zscore = calculate_zscore(production_data)

# Detect the outliers (the threshold is set to 2.5)
threshold = 2.5
# The loop variable is named `z` so it does not shadow the `zscore` list
# that is indexed again when printing below.
outliers = [(i, production_data[i]) for i, z in enumerate(zscore) if abs(z) > threshold]
print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}, Zscore: {zscore[day]:.2f}")

# Plot the data.
# The x axis is derived from the data length (1-based day numbers) so the
# plot keeps working when the number of measurements changes.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o')
for day, value in outliers:
    plt.plot(day+1, value, 'ro', markersize=10)  # show outliers in red
plt.title('Measured data')
plt.xlabel('Day')
plt.ylabel('Value')
plt.grid(True)
plt.show()