🎄 科学と神々株式会社 アドベントカレンダー 2025
Hybrid License System Day 9: ヘルスチェックと監視
API Gateway編 (4/5)
📖 はじめに
Day 9では、ヘルスチェックと監視を学びます。ヘルスチェックエンドポイント、サービス監視、Prometheusメトリクス収集、アラート設定を実装しましょう。
🏥 ヘルスチェックエンドポイント
基本的なヘルスチェック
// api-gateway/src/routes/health.js
const express = require('express');
const router = express.Router();
// Basic health check: always answers 200 with a fixed "healthy" status,
// the current ISO timestamp and the process uptime in seconds.
router.get('/health', (req, res) => {
  const payload = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
  };

  res.status(200).json(payload);
});
module.exports = router;
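詳細なヘルスチェック（以下のルートは health.js の module.exports より前に追加します。axios と AUTH_SERVICE_URL / ADMIN_SERVICE_URL はモジュール内で定義済みであることを前提とします）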
// Health check that also covers the dependent backend services.
// NOTE(review): `axios`, `AUTH_SERVICE_URL` and `ADMIN_SERVICE_URL` are not
// defined in the visible snippet; they must already be in scope in this module.

/**
 * Probes `<baseUrl>/health` with a 3 s timeout and summarises the outcome.
 * Never rejects: any request failure is folded into the returned entry so the
 * caller can probe several services with `Promise.all` without short-circuiting.
 *
 * @param {string} baseUrl - Base URL of the service to probe.
 * @returns {Promise<{status: string, latency?: string, error?: string}>}
 *   `{ status, latency }` when a response was received,
 *   `{ status: 'unhealthy', error }` when the request failed.
 */
async function probeServiceHealth(baseUrl) {
  const startedAt = Date.now();
  try {
    const response = await axios.get(`${baseUrl}/health`, { timeout: 3000 });
    return {
      status: response.status === 200 ? 'healthy' : 'unhealthy',
      latency: `${Date.now() - startedAt}ms`,
    };
  } catch (error) {
    return {
      status: 'unhealthy',
      error: error.message,
    };
  }
}

router.get('/health/detailed', async (req, res) => {
  const health = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    services: {
      'auth-service': { status: 'unknown', latency: null },
      'admin-service': { status: 'unknown', latency: null },
    },
    memory: process.memoryUsage(),
    cpu: process.cpuUsage(),
  };

  // Probe both backends concurrently: the worst case is a single 3 s timeout
  // instead of the 6 s the sequential version could take.
  const [authHealth, adminHealth] = await Promise.all([
    probeServiceHealth(AUTH_SERVICE_URL),
    probeServiceHealth(ADMIN_SERVICE_URL),
  ]);
  health.services['auth-service'] = authHealth;
  health.services['admin-service'] = adminHealth;

  // Overall status: anything other than "every service healthy" is degraded.
  const allHealthy = Object.values(health.services).every(
    (service) => service.status === 'healthy',
  );
  if (!allHealthy) {
    health.status = 'degraded';
  }

  res.status(allHealthy ? 200 : 503).json(health);
});
Readinessプローブ
// Readiness probe for Kubernetes.
// Minimal check: the gateway reports ready only if the Auth Service's /health
// endpoint answers within 2 seconds; otherwise it responds 503 with the error message.
router.get('/ready', async (req, res) => {
  try {
    const probeOptions = { timeout: 2000 };
    await axios.get(`${AUTH_SERVICE_URL}/health`, probeOptions);

    const readyBody = {
      ready: true,
      timestamp: new Date().toISOString(),
    };
    res.status(200).json(readyBody);
  } catch (probeError) {
    const notReadyBody = {
      ready: false,
      error: probeError.message,
      timestamp: new Date().toISOString(),
    };
    res.status(503).json(notReadyBody);
  }
});
Livenessプローブ
// Liveness probe for Kubernetes.
// Trivial check: if the process can answer at all it is considered alive,
// so this always returns 200 with the current timestamp and uptime.
router.get('/live', (req, res) => {
  const body = {
    alive: true,
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
  };

  res.status(200).json(body);
});
📊 Prometheusメトリクス収集
prom-client導入
npm install prom-client
メトリクス定義
// api-gateway/src/metrics/prometheus.js
const promClient = require('prom-client');
// Enable the default process metrics (CPU, memory, etc.).
// No `timeout` option is passed: prom-client >= 12 collects default metrics at
// scrape time and no longer supports it, and the awaited `register.metrics()`
// used by the /metrics route implies such a version.
promClient.collectDefaultMetrics();
// カスタムメトリクス定義
// 1. HTTPリクエスト数(Counter)
const httpRequestsTotal = new promClient.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code']
});
// 2. HTTPリクエスト時間(Histogram)
const httpRequestDuration = new promClient.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2.5, 5, 10]
});
// 3. アクティブなリクエスト数(Gauge)
const httpRequestsInProgress = new promClient.Gauge({
name: 'http_requests_in_progress',
help: 'Number of HTTP requests currently in progress',
labelNames: ['method', 'route']
});
// 4. ライセンスアクティベーション数(Counter)
const licenseActivations = new promClient.Counter({
name: 'license_activations_total',
help: 'Total number of license activations',
labelNames: ['plan', 'status']
});
// 5. バックエンドサービスエラー(Counter)
const backendErrors = new promClient.Counter({
name: 'backend_errors_total',
help: 'Total number of backend service errors',
labelNames: ['service', 'error_type']
});
module.exports = {
httpRequestsTotal,
httpRequestDuration,
httpRequestsInProgress,
licenseActivations,
backendErrors,
register: promClient.register
};
メトリクスミドルウェア
// api-gateway/src/middleware/metrics.js
const {
httpRequestsTotal,
httpRequestDuration,
httpRequestsInProgress
} = require('../metrics/prometheus');
/**
 * Express middleware that records request count, request duration and the
 * number of in-flight requests.
 *
 * The `route` label of the counter and histogram is resolved when the response
 * finishes rather than on entry: Express only sets `req.route` once a route has
 * matched, so a globally mounted middleware would otherwise label every request
 * with its raw path (e.g. `/users/42`) and create one time series per URL.
 * When no route matched, the path seen on entry is used as before.
 *
 * The in-progress gauge is decremented exactly once, on whichever of
 * `finish` / `close` fires first, so requests aborted by the client no longer
 * leave the gauge permanently incremented.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response (an event emitter).
 * @param {Function} next - Continues the middleware chain.
 */
function metricsMiddleware(req, res, next) {
  const start = Date.now();
  // Labels for the gauge are fixed on entry so inc() and dec() hit the same series.
  const entryRoute = req.route?.path || req.path;

  // Request started.
  httpRequestsInProgress.labels(req.method, entryRoute).inc();

  let settled = false;
  const settle = (completed) => {
    if (settled) {
      return;
    }
    settled = true;

    // Request is no longer in flight, whether it completed or was aborted.
    httpRequestsInProgress.labels(req.method, entryRoute).dec();

    if (!completed) {
      // Connection closed before a response was sent: nothing to count or time.
      return;
    }

    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern (prefixed with the router mount path).
    const route = req.route?.path
      ? `${req.baseUrl || ''}${req.route.path}`
      : entryRoute;

    httpRequestsTotal.labels(req.method, route, res.statusCode).inc();
    httpRequestDuration.labels(req.method, route, res.statusCode).observe(duration);
  };

  res.on('finish', () => settle(true));
  res.on('close', () => settle(false));

  next();
}
module.exports = metricsMiddleware;
メトリクスエンドポイント
// api-gateway/src/routes/metrics.js
const express = require('express');
const router = express.Router();
const { register } = require('../metrics/prometheus');
// Prometheus metrics endpoint: serialises every metric in the registry.
router.get('/metrics', async (req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error) {
    // Without this, a rejected `register.metrics()` escapes the async handler
    // as an unhandled rejection (under Express 4) and the scrape never gets a reply.
    res.status(500).end(String(error?.message ?? error));
  }
});
module.exports = router;
カスタムメトリクスの記録
// api-gateway/src/routes/auth.js
const { licenseActivations, backendErrors } = require('../metrics/prometheus');
// Proxies a license activation to the Auth Service and records the outcome
// in the activation counter (and the backend error counter on failure).
router.post('/license/activate', async (req, res) => {
  try {
    const activateUrl = `${AUTH_SERVICE_URL}/activate`;
    const response = await axios.post(activateUrl, req.body);
    const activation = response.data;

    // Count the successful activation under the plan reported by the backend.
    const plan = activation.plan || 'unknown';
    licenseActivations.labels(plan, 'success').inc();

    res.json(activation);
  } catch (error) {
    // Count the backend failure, then the failed activation.
    const errorType = error.code || 'unknown';
    backendErrors.labels('auth-service', errorType).inc();
    licenseActivations.labels('unknown', 'failure').inc();

    handleProxyError(error, res);
  }
});
📈 Grafanaダッシュボード
Prometheus設定
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Load the alerting rules. The path matches where docker-compose mounts
# alert_rules.yml; without this section the rules are never evaluated.
rule_files:
  - /etc/prometheus/alert_rules.yml

# Send firing alerts to the Alertmanager service from docker-compose.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'api-gateway'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['api-gateway:3000']

  - job_name: 'auth-service'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['auth-service:3001']

  - job_name: 'admin-service'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['admin-service:3002']
Grafanaダッシュボード設定(JSON)
{
  "dashboard": {
    "title": "Hybrid License System - API Gateway",
    "panels": [
      {
        "title": "Request Rate",
        "targets": [
          { "expr": "rate(http_requests_total[5m])" }
        ]
      },
      {
        "title": "Request Duration (p95)",
        "targets": [
          { "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))" }
        ]
      },
      {
        "title": "Error Rate",
        "targets": [
          { "expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])" }
        ]
      },
      {
        "title": "Active Requests",
        "targets": [
          { "expr": "http_requests_in_progress" }
        ]
      },
      {
        "title": "License Activations",
        "targets": [
          { "expr": "rate(license_activations_total[5m])" }
        ]
      }
    ]
  }
}
🚨 アラート設定
Prometheusアラートルール
# alert_rules.yml
groups:
  - name: api_gateway_alerts
    interval: 30s
    rules:
      # High Error Rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/sec"

      # High Response Time
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "P95 response time is {{ $value }} seconds"

      # Service Down
      - alert: ServiceDown
        expr: up{job="api-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API Gateway is down"
          description: "API Gateway has been down for more than 1 minute"

      # Backend Service Errors
      - alert: BackendServiceErrors
        expr: rate(backend_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High backend service error rate"
          description: "Backend errors: {{ $value }} errors/sec"

      # High Memory Usage
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }} MB"
Alertmanager設定
# alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default-receiver'
  routes:
    # `matchers` replaces the deprecated `match` form.
    - matchers:
        - severity="critical"
      receiver: 'critical-receiver'
      continue: true
    - matchers:
        - severity="warning"
      receiver: 'warning-receiver'

receivers:
  - name: 'default-receiver'
    webhook_configs:
      - url: 'http://slack-webhook/alerts'

  - name: 'critical-receiver'
    email_configs:
      # NOTE: placeholder credentials. Supply the real values from a secret
      # store at deploy time; do not commit them to the repository.
      - to: 'ops@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager'
        auth_password: 'password'

  - name: 'warning-receiver'
    slack_configs:
      # NOTE: placeholder webhook URL; the real one is a secret as well.
      - api_url: 'https://hooks.slack.com/services/XXX'
        channel: '#alerts'
🔍 ログ集約
構造化ログ
// api-gateway/src/utils/logger.js
const winston = require('winston');
// Structured logger for the gateway.
// Entries are timestamped JSON tagged with `service: 'api-gateway'`.
const logFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.json()
);

const logTransports = [
  // This file transport is limited to the 'error' level.
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  // This one has no level of its own.
  new winston.transports.File({ filename: 'combined.log' }),
  // Console output additionally applies the simple format.
  new winston.transports.Console({
    format: winston.format.simple(),
  }),
];

const logger = winston.createLogger({
  level: 'info',
  format: logFormat,
  defaultMeta: { service: 'api-gateway' },
  transports: logTransports,
});
module.exports = logger;
リクエストログ
// api-gateway/src/middleware/requestLogger.js
const logger = require('../utils/logger');
/**
 * Express middleware that writes one structured `info` log entry each time
 * the response emits `finish`.
 *
 * The entry carries the request id, method, path, status code, elapsed time
 * (as `"<n>ms"`), client IP, user agent and the authenticated user id
 * (`null` when `req.user` is absent or has no `userId`).
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Continues the middleware chain.
 */
function requestLogger(req, res, next) {
  const startedAt = Date.now();

  const logCompletedRequest = () => {
    const elapsedMs = Date.now() - startedAt;
    const entry = {
      type: 'request',
      requestId: req.id,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: `${elapsedMs}ms`,
      ip: req.ip,
      userAgent: req.get('user-agent'),
      userId: req.user?.userId || null,
    };
    logger.info(entry);
  };

  res.on('finish', logCompletedRequest);
  next();
}
module.exports = requestLogger;
🐳 Docker Composeでの統合
# docker-compose.yml
version: '3.8'

services:
  api-gateway:
    build: ./api-gateway
    ports:
      - "3000:3000"
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    ports:
      # Grafana listens on 3000 in the container; published on host port 3001.
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

volumes:
  grafana-storage:
🎯 次のステップ
Day 10では、API Gatewayの最適化について学びます。キャッシング戦略、コネクションプーリング、パフォーマンスチューニングを実装しましょう。
🔗 関連リンク
次回予告: Day 10では、Redisキャッシュとコネクションプーリングによるパフォーマンス最適化を詳しく解説します!
Copyright © 2025 Gods & Golem, Inc. All rights reserved.