0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

Pythonで異常値の検出:3つの方法

Posted at

何らかの測定データの異常値を検出する方法をサンプルデータを用いて3つ示す.

移動平均法

MovingAve_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sample data: 30 days of measurements
production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

def detect_outliers_moving_average(data, window=5, n_sigmas=2):
    """Flag points that stray too far from a trailing moving average.

    Each point is compared with the mean and sample standard deviation of
    the ``window`` points that come *before* it. The point under test is
    deliberately kept out of its own baseline: when a value is part of the
    ``n`` samples it is judged against, its distance from their mean can
    never exceed ``(n - 1) / sqrt(n)`` sample standard deviations (about
    1.79 for ``n = 5``), so a 2-sigma band around a window that contains
    the point itself can never be crossed. Using only preceding points also
    means the most recent sample is always evaluated.

    Args:
        data: One-dimensional sequence of numeric measurements.
        window: Number of preceding points that form the baseline.
        n_sigmas: Half-width of the accepted band, expressed in baseline
            standard deviations.

    Returns:
        A ``(df, outliers)`` tuple. ``df`` is a DataFrame with one row per
        input point and the columns ``value``, ``moving_avg``,
        ``moving_std``, ``lower_bound``, ``upper_bound`` and ``is_outlier``.
        ``outliers`` is the list of row positions whose value lies outside
        ``[lower_bound, upper_bound]``. The first ``window`` rows have no
        complete baseline, so their statistics are NaN and they are never
        flagged.
    """
    df = pd.DataFrame(data, columns=['value'])

    # shift(1) moves every value one row down, so the rolling window that
    # ends on row i covers rows i-window .. i-1 and excludes row i itself.
    baseline = df['value'].shift(1).rolling(window=window)
    df['moving_avg'] = baseline.mean()
    df['moving_std'] = baseline.std()

    band = n_sigmas * df['moving_std']
    df['lower_bound'] = df['moving_avg'] - band
    df['upper_bound'] = df['moving_avg'] + band

    # Comparisons against NaN bounds evaluate to False, so rows without a
    # full baseline are reported as normal.
    df['is_outlier'] = (df['value'] < df['lower_bound']) | (df['value'] > df['upper_bound'])

    outliers = df.index[df['is_outlier']].tolist()
    return df, outliers

# Detect outliers
df, outliers = detect_outliers_moving_average(production_data)

print("Detected Outliers:")
for day in outliers:
    print(f"Date: {day+1}, Volume: {production_data[day]}")

# Plot the data. The x axis is derived from the data length so the plot keeps
# working when the number of measurements is not exactly 30.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Volume')
plt.plot(days, df['moving_avg'], label='Moving Average')
plt.fill_between(days, df['lower_bound'], df['upper_bound'], alpha=0.2, label='Normal range')

# Highlight every detected outlier with a large red marker
for day in outliers:
    plt.plot(day+1, production_data[day], 'ro', markersize=10)

plt.title('Outlier Detection by Moving Average')
plt.xlabel('Day')
plt.ylabel('Volume')
plt.legend()
plt.grid(True)
plt.show()

スクリーンショット 2024-11-07 10.42.54.jpg

IQR

IQR_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Sample data: 30 measurements; the final value is the intended outlier
production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

def detect_outliers_iqr(data, multiplier=1.5):
    """Find values lying outside the interquartile-range fences.

    The fences are placed ``multiplier`` times the interquartile range
    (Q3 - Q1) below the first quartile and above the third quartile.

    Args:
        data: Sized sequence of numeric measurements.
        multiplier: Width of the fences in units of the interquartile
            range. The default of 1.5 reproduces the original fixed rule.

    Returns:
        A list of ``(index, value)`` tuples, in input order, for every
        value strictly below the lower fence or strictly above the upper
        fence. An empty input yields an empty list.
    """
    # Quartiles are undefined for an empty sample; there is nothing to flag.
    if len(data) == 0:
        return []

    q1, q3 = np.percentile(data, [25, 75])
    fence = multiplier * (q3 - q1)
    lower_bound = q1 - fence
    upper_bound = q3 + fence
    return [(i, x) for i, x in enumerate(data) if x < lower_bound or x > upper_bound]

# Detect outliers
outliers = detect_outliers_iqr(production_data)

print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}")

# Plot the data. The x axis is derived from the data length so the plot keeps
# working when the number of measurements is not exactly 30.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o', label='Production Volume')
for position, (day, value) in enumerate(outliers):
    # Only the first marker carries a label so the legend shows one 'Outlier' entry
    plt.plot(day+1, value, 'ro', markersize=10, label='Outlier' if position == 0 else "")

# Draw the lower and upper IQR fences (recomputed here because the detector
# returns only the flagged points, not the bounds)
q1 = np.percentile(production_data, 25)
q3 = np.percentile(production_data, 75)
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
plt.axhline(y=lower_bound, color='r', linestyle='--', label='IQR')
plt.axhline(y=upper_bound, color='r', linestyle='--')

plt.title('Production Volume and Outlier Detection by IQR')
plt.xlabel('Date')
plt.ylabel('Production Volume')
plt.legend()
plt.grid(True)
plt.show()

スクリーンショット 2024-11-07 10.45.43.jpg

Z Score

zscore_detect_outlier.py
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Sample data: 30 days of measurements
production_data = [
    520, 510, 515, 530, 525, 535, 540, 515, 525, 530,
    520, 510, 515, 525, 530, 535, 540, 545, 550, 555,
    530, 525, 520, 515, 510, 505, 500, 495, 490, 700  # the last value is abnormally high
]

# Function that computes z-scores
def calculate_zscore(data):
    """Return the z-score of every value in ``data``.

    A z-score is the distance of a value from the mean of ``data``,
    expressed in standard deviations. The standard deviation comes from
    ``np.std`` with its default settings (population form, ``ddof=0``).

    Args:
        data: Sized sequence of numeric measurements.

    Returns:
        A list with one score per input value, in input order. An empty
        input yields an empty list. When every value is identical the
        spread is zero, and each score is reported as ``0.0``.
    """
    if len(data) == 0:
        return []

    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        # Constant data has no spread; dividing by zero would produce NaN
        # scores, so report "no deviation" for every point instead.
        return [0.0 for _ in data]
    return [(x - mean) / std for x in data]

# Compute the z-scores
zscore = calculate_zscore(production_data)

# Detect outliers (the threshold is set to 2.5)
threshold = 2.5
# The loop variable is named ``z`` so it does not shadow the ``zscore`` list,
# which is indexed again when the results are printed below.
outliers = [(i, production_data[i]) for i, z in enumerate(zscore) if abs(z) > threshold]

print("Detected Outliers : ")
for day, value in outliers:
    print(f"Date: {day+1}, Volume: {value}, Zscore: {zscore[day]:.2f}")

# Plot the data. The x axis is derived from the data length so the plot keeps
# working when the number of measurements is not exactly 30.
days = range(1, len(production_data) + 1)
plt.figure(figsize=(12, 6))
plt.plot(days, production_data, marker='o')
for day, value in outliers:
    plt.plot(day+1, value, 'ro', markersize=10)  # show outliers in red

plt.title('Measured data')
plt.xlabel('Day')
plt.ylabel('Value')
plt.grid(True)
plt.show()

スクリーンショット 2024-11-07 10.39.58.jpg

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?