Pythonで異常値の検出：３つの方法

Posted at 2024-12-16

何らかの測定データに含まれる異常値を検出する方法を，サンプルデータを用いて３つ（移動平均法・IQR・Zスコア）示す．

移動平均法

MovingAve_detect_outlier.py

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sample data: 30 days of measurements.
production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

def detect_outliers_moving_average(data, window=5, n_sigmas=2):
    """Flag samples that leave a moving ``mean +/- n_sigmas * std`` band.

    The band for each sample is built from the ``window`` samples that
    immediately precede it; the sample under test is deliberately kept out
    of its own statistics.  If it were included, a sample could deviate from
    the window mean by at most ``(window - 1) / sqrt(window)`` sample standard
    deviations (about 1.79 for ``window=5``), so a 2-sigma test could never
    fire, and a centred window would additionally leave the most recent
    samples without any band at all.

    Args:
        data: Sequence of numeric measurements, in time order.
        window: Number of preceding samples used for the mean and the
            standard deviation of each band.
        n_sigmas: Half-width of the band, in standard deviations.

    Returns:
        A ``(df, outliers)`` tuple.  ``df`` has one row per sample with the
        columns ``value``, ``moving_avg``, ``moving_std``, ``lower_bound``,
        ``upper_bound`` and ``is_outlier``.  ``outliers`` is the list of row
        index labels whose value lies outside the band.  The first ``window``
        rows have no complete history, so their band is NaN and they are
        never flagged.
    """
    df = pd.DataFrame(data, columns=['value'])

    # shift(1) moves every value one row down, so the rolling window that
    # ends on row i covers rows i-window .. i-1 and excludes row i itself.
    history = df['value'].shift(1).rolling(window=window)
    df['moving_avg'] = history.mean()
    df['moving_std'] = history.std()

    df['lower_bound'] = df['moving_avg'] - n_sigmas * df['moving_std']
    df['upper_bound'] = df['moving_avg'] + n_sigmas * df['moving_std']

    # Comparisons against NaN bounds evaluate to False, which is what keeps
    # the warm-up rows out of the result.
    df['is_outlier'] = (df['value'] < df['lower_bound']) | (df['value'] > df['upper_bound'])

    outliers = df.index[df['is_outlier']].tolist()
    return df, outliers

# Detect the outliers.
df, outliers = detect_outliers_moving_average(production_data)

print("Detected Outliers:")
for day in outliers:
    print(f"Date: {day+1}, Volume: {production_data[day]}")

# Plot the data.  Days are numbered from 1 and derived from the data length,
# so the figure stays valid when the number of samples changes.
days = range(1, len(production_data) + 1)

plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Volume')
plt.plot(days, df['moving_avg'], label='Moving Average')
plt.fill_between(days, df['lower_bound'], df['upper_bound'], alpha=0.2, label='Normal range')

# Highlight every detected outlier in red ('ro' draws markers only, no line).
plt.plot(
    [day + 1 for day in outliers],
    [production_data[day] for day in outliers],
    'ro',
    markersize=10,
)

plt.title('Outlier Detection by Moving Average')
plt.xlabel('Day')
plt.ylabel('Volume')
plt.legend()
plt.grid(True)
plt.show()

IQR

IQR_detect_outlier.py

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

def detect_outliers_iqr(data, k=1.5):
    """Return the samples that lie outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``data`` and
    ``IQR = Q3 - Q1``.

    Args:
        data: Sequence of numeric measurements.
        k: Fence multiplier applied to the IQR.  The default of 1.5 is the
            value this function has always used.

    Returns:
        A list of ``(index, value)`` tuples, in input order, for every sample
        strictly below the lower fence or strictly above the upper fence.
        Empty input yields an empty list.
    """
    # len() rather than truthiness so that NumPy arrays are accepted too.
    if len(data) == 0:
        return []

    q1, q3 = np.percentile(data, [25, 75])
    margin = k * (q3 - q1)
    lower_bound = q1 - margin
    upper_bound = q3 + margin
    return [(i, x) for i, x in enumerate(data) if x < lower_bound or x > upper_bound]

# Detect the outliers.
outliers = detect_outliers_iqr(production_data)

print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}")

# Plot the data.  Days are numbered from 1 and derived from the data length,
# so the figure stays valid when the number of samples changes.
days = range(1, len(production_data) + 1)

plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Production Volume')
if outliers:
    # A single call draws every marker and yields exactly one legend entry.
    plt.plot(
        [day + 1 for day, _ in outliers],
        [value for _, value in outliers],
        'ro',
        markersize=10,
        label='Outlier',
    )

# Draw the IQR fences.  These repeat the Q1/Q3 -/+ 1.5 * IQR computation
# because detect_outliers_iqr returns only the flagged samples.
q1 = np.percentile(production_data, 25)
q3 = np.percentile(production_data, 75)
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
plt.axhline(y=lower_bound, color='r', linestyle='--', label='IQR')
plt.axhline(y=upper_bound, color='r', linestyle='--')

plt.title('Production Volume and Outlier Detection by IQR')
plt.xlabel('Date')
plt.ylabel('Production Volume')
plt.legend()
plt.grid(True)
plt.show()

Zスコア（Z-score）

zscore_detect_outlier.py

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Sample data: 30 days of measurements.
production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

# Compute the z-score of every sample.
def calculate_zscore(data):
    """Return the z-score ``(x - mean) / std`` of each element of ``data``.

    The standard deviation is the population one (``ddof=0``), which is what
    ``np.std`` computes by default.

    Args:
        data: Sequence of numeric measurements.

    Returns:
        A list of floats, one per input element, in input order.  Empty input
        yields an empty list.  When every element is identical the standard
        deviation is zero and no sample deviates from the mean, so all scores
        are returned as ``0.0`` instead of dividing by zero.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    mean = values.mean()
    std = values.std()
    if std == 0:
        return [0.0] * values.size
    return ((values - mean) / std).tolist()

# Compute the z-scores.
zscore = calculate_zscore(production_data)

# Detect the outliers (threshold of 2.5).
threshold = 2.5
# The loop variable is named `z` so that it does not shadow the `zscore` list,
# which is indexed again when the results are printed below.
outliers = [(i, production_data[i]) for i, z in enumerate(zscore) if abs(z) > threshold]

print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}, Zscore: {zscore[day]:.2f}")

# Plot the data.  Days are numbered from 1 and derived from the data length,
# so the figure stays valid when the number of samples changes.
days = range(1, len(production_data) + 1)

plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o')
# Highlight the outliers in red ('ro' draws markers only, no line).
plt.plot(
    [day + 1 for day, _ in outliers],
    [value for _, value in outliers],
    'ro',
    markersize=10,
)

plt.title('Measured data')
plt.xlabel('Day')
plt.ylabel('Value')
plt.grid(True)
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up