科学と神々株式会社 Advent Calendar 2025

Hybrid License System Day 9: ヘルスチェックと監視

Last updated at 2025-12-08 · Posted at 2025-12-08

🎄 科学と神々株式会社アドベントカレンダー 2025

Hybrid License System Day 9: ヘルスチェックと監視

API Gateway編 (4/5)

📖 はじめに

Day 9では、ヘルスチェックと監視を学びます。ヘルスチェックエンドポイント、サービス監視、Prometheusメトリクス収集、アラート設定を実装しましょう。

🏥 ヘルスチェックエンドポイント

基本的なヘルスチェック

// api-gateway/src/routes/health.js
const express = require('express');
const router = express.Router();

// Simple health check: always answers 200 with the current time and the
// process uptime; no dependency is contacted.
router.get('/health', (req, res) => {
  const payload = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };

  res.status(200);
  res.json(payload);
});

module.exports = router;

詳細なヘルスチェック

// Health check that also probes the dependent backend services.

/**
 * Probe a backend's `/health` endpoint and summarise the outcome.
 *
 * Never rejects: any request failure (timeout, connection error, error
 * status rejected by axios) is reported as an unhealthy result instead.
 *
 * @param {string} baseUrl - Base URL of the service to probe.
 * @returns {Promise<{status: string, latency?: string, error?: string}>}
 *   `latency` is present when the service answered, `error` when it did not.
 */
async function probeService(baseUrl) {
  const startedAt = Date.now();

  try {
    const response = await axios.get(`${baseUrl}/health`, { timeout: 3000 });

    return {
      status: response.status === 200 ? 'healthy' : 'unhealthy',
      latency: `${Date.now() - startedAt}ms`
    };
  } catch (error) {
    return {
      status: 'unhealthy',
      error: error.message
    };
  }
}

/**
 * GET /health/detailed
 *
 * Responds 200 with status "healthy" when every backend probe succeeded,
 * otherwise 503 with status "degraded". The body also carries process
 * uptime, memory usage and CPU usage.
 */
router.get('/health/detailed', async (req, res) => {
  const health = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    services: {},
    memory: process.memoryUsage(),
    cpu: process.cpuUsage()
  };

  // The probes are independent, so run them concurrently: the worst case is
  // one probe timeout (3s) instead of the sum of both.
  const [authService, adminService] = await Promise.all([
    probeService(AUTH_SERVICE_URL),
    probeService(ADMIN_SERVICE_URL)
  ]);

  health.services['auth-service'] = authService;
  health.services['admin-service'] = adminService;

  // Overall status: a single unhealthy dependency degrades the gateway.
  const allHealthy = Object.values(health.services).every(s => s.status === 'healthy');
  if (!allHealthy) {
    health.status = 'degraded';
  }

  res.status(allHealthy ? 200 : 503).json(health);
});

Readinessプローブ

// Endpoint for the Kubernetes readiness probe.
// Ready means the Auth Service answered its own health check within 2s.
router.get('/ready', async (req, res) => {
  try {
    // Minimal check: does the Auth Service respond at all?
    const probeOptions = { timeout: 2000 };
    await axios.get(`${AUTH_SERVICE_URL}/health`, probeOptions);

    const readyBody = {
      ready: true,
      timestamp: new Date().toISOString()
    };
    res.status(200).json(readyBody);
  } catch (probeError) {
    const notReadyBody = {
      ready: false,
      error: probeError.message,
      timestamp: new Date().toISOString()
    };
    res.status(503).json(notReadyBody);
  }
});

Livenessプローブ

// Endpoint for the Kubernetes liveness probe.
// Being able to answer at all is the whole check, so no dependency is probed.
router.get('/live', (req, res) => {
  const liveness = {
    alive: true,
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };

  res.status(200);
  res.json(liveness);
});

📊 Prometheusメトリクス収集

prom-client導入

npm install prom-client

メトリクス定義

// api-gateway/src/metrics/prometheus.js
const promClient = require('prom-client');

// デフォルトメトリクス（CPU、メモリなど）を有効化
promClient.collectDefaultMetrics({ timeout: 5000 });

// カスタムメトリクス定義

// 1. HTTPリクエスト数（Counter）
const httpRequestsTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// 2. HTTPリクエスト時間（Histogram）
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2.5, 5, 10]
});

// 3. アクティブなリクエスト数（Gauge）
const httpRequestsInProgress = new promClient.Gauge({
  name: 'http_requests_in_progress',
  help: 'Number of HTTP requests currently in progress',
  labelNames: ['method', 'route']
});

// 4. ライセンスアクティベーション数（Counter）
const licenseActivations = new promClient.Counter({
  name: 'license_activations_total',
  help: 'Total number of license activations',
  labelNames: ['plan', 'status']
});

// 5. バックエンドサービスエラー（Counter）
const backendErrors = new promClient.Counter({
  name: 'backend_errors_total',
  help: 'Total number of backend service errors',
  labelNames: ['service', 'error_type']
});

module.exports = {
  httpRequestsTotal,
  httpRequestDuration,
  httpRequestsInProgress,
  licenseActivations,
  backendErrors,
  register: promClient.register
};

メトリクスミドルウェア

// api-gateway/src/middleware/metrics.js
const {
  httpRequestsTotal,
  httpRequestDuration,
  httpRequestsInProgress
} = require('../metrics/prometheus');

/**
 * Express middleware that records request count, duration and the number of
 * in-flight requests.
 *
 * @param {object} req - Incoming request (reads `method`, `path`, `route`).
 * @param {object} res - Response; its `finish` and `close` events are observed.
 * @param {Function} next - Invoked synchronously once the listeners are attached.
 */
function metricsMiddleware(req, res, next) {
  const start = Date.now();

  // Label available when the middleware runs. When this is mounted ahead of
  // the routes `req.route` is not populated yet, so this is the raw path.
  // NOTE(review): raw paths can produce many label values — confirm acceptable.
  const startRoute = req.route?.path || req.path;

  // On request start. Keep the labelled child so the decrement below hits
  // exactly the series that was incremented.
  const inProgress = httpRequestsInProgress.labels(req.method, startRoute);
  inProgress.inc();

  let released = false;
  const release = () => {
    if (released) {
      return;
    }
    released = true;
    inProgress.dec();
  };

  // On response completion.
  res.once('finish', () => {
    const duration = (Date.now() - start) / 1000;

    // Resolve the route now: by this point a matched route has set
    // `req.route`, giving the pattern (e.g. `/users/:id`) rather than the URL.
    const route = req.route?.path || startRoute;

    // Count the request
    httpRequestsTotal.labels(req.method, route, res.statusCode).inc();

    // Record the request duration
    httpRequestDuration.labels(req.method, route, res.statusCode).observe(duration);

    // Decrease the in-flight count
    release();
  });

  // `finish` never fires when the client disconnects before the response is
  // written; without this listener the gauge would drift upwards forever.
  res.once('close', release);

  next();
}

module.exports = metricsMiddleware;

メトリクスエンドポイント

// api-gateway/src/routes/metrics.js
const express = require('express');
const router = express.Router();
const { register } = require('../metrics/prometheus');

// Prometheus metrics endpoint.
// Serialises the registry in the Prometheus exposition format. Collection is
// awaited before any header is written so that a failure can still be
// reported as a 500 instead of leaving the request hanging on an unhandled
// rejection.
router.get('/metrics', async (req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (error) {
    res.status(500).end(error.message);
  }
});

module.exports = router;

カスタムメトリクスの記録

// api-gateway/src/routes/auth.js
const { licenseActivations, backendErrors } = require('../metrics/prometheus');

// Forwards a license activation to the Auth Service and records the outcome
// in the custom metrics.
router.post('/license/activate', async (req, res) => {
  try {
    const activateUrl = `${AUTH_SERVICE_URL}/activate`;
    const response = await axios.post(activateUrl, req.body);
    const payload = response.data;

    // Record the successful activation, labelled with the returned plan.
    const plan = payload.plan || 'unknown';
    licenseActivations.labels(plan, 'success').inc();

    res.json(payload);
  } catch (error) {
    // Record the error; the plan is not known on this path.
    const errorType = error.code || 'unknown';
    backendErrors.labels('auth-service', errorType).inc();
    licenseActivations.labels('unknown', 'failure').inc();

    handleProxyError(error, res);
  }
});

📈 Grafanaダッシュボード

Prometheus設定

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Load the alerting rules. Without this section no rule is ever evaluated,
# even if the rule file is mounted next to this config.
rule_files:
  - 'alert_rules.yml'

# Where firing alerts are sent. Without this section alerts only show up in
# the Prometheus UI and never reach Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'api-gateway'
    static_configs:
      - targets: ['api-gateway:3000']
    metrics_path: '/metrics'

  - job_name: 'auth-service'
    static_configs:
      - targets: ['auth-service:3001']
    metrics_path: '/metrics'

  - job_name: 'admin-service'
    static_configs:
      - targets: ['admin-service:3002']
    metrics_path: '/metrics'

Grafanaダッシュボード設定（JSON）

{
  "dashboard": {
    "title": "Hybrid License System - API Gateway",
    "panels": [
      {
        "title": "Request Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])"
          }
        ]
      },
      {
        "title": "Request Duration (p95)",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
          }
        ]
      },
      {
        "title": "Error Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])"
          }
        ]
      },
      {
        "title": "Active Requests",
        "targets": [
          {
            "expr": "http_requests_in_progress"
          }
        ]
      },
      {
        "title": "License Activations",
        "targets": [
          {
            "expr": "rate(license_activations_total[5m])"
          }
        ]
      }
    ]
  }
}

🚨 アラート設定

Prometheusアラートルール

# alert_rules.yml
# Alerting rules built on the metrics exported by the API Gateway.
groups:
  - name: api_gateway_alerts
    # Evaluation interval for this group.
    interval: 30s
    rules:
      # High Error Rate
      # Per-second rate of 5xx responses (5m window) above 0.05 for 5 minutes.
      # This is an absolute rate per series, not a ratio of all requests.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/sec"

      # High Response Time
      # 95th percentile of request duration (5m window) above 2 seconds
      # for 5 minutes.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "P95 response time is {{ $value }} seconds"

      # Service Down
      # The api-gateway scrape target has been failing for 1 minute.
      - alert: ServiceDown
        expr: up{job="api-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API Gateway is down"
          description: "API Gateway has been down for more than 1 minute"

      # Backend Service Errors
      # Per-second rate of backend_errors_total (5m window) above 0.1
      # for 5 minutes.
      - alert: BackendServiceErrors
        expr: rate(backend_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High backend service error rate"
          description: "Backend errors: {{ $value }} errors/sec"

      # High Memory Usage
      # Resident memory, converted from bytes to MiB, above 500 for 10 minutes.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }} MB"

Alertmanager設定

# alertmanager.yml
global:
  resolve_timeout: 5m

# Routing tree: alerts are grouped by name and severity, then dispatched to a
# receiver chosen by their `severity` label. Anything matching no child route
# goes to the default receiver.
route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default-receiver'

  routes:
    # `matchers` replaces the deprecated `match` field.
    - matchers:
        - severity="critical"
      receiver: 'critical-receiver'
      continue: true

    - matchers:
        - severity="warning"
      receiver: 'warning-receiver'

receivers:
  - name: 'default-receiver'
    webhook_configs:
      - url: 'http://slack-webhook/alerts'

  - name: 'critical-receiver'
    email_configs:
      - to: 'ops@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager'
        # NOTE(review): placeholder credential — do not commit a real password here.
        auth_password: 'password'

  - name: 'warning-receiver'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXX'
        channel: '#alerts'

🔍 ログ集約

構造化ログ

// api-gateway/src/utils/logger.js
const winston = require('winston');

// Default format: a timestamp is added to every entry, which is then
// serialised as JSON.
const logFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.json()
);

// error.log receives error-level entries only, combined.log receives
// everything the logger emits, and the console uses winston's simple format.
const logTransports = [
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  new winston.transports.File({ filename: 'combined.log' }),
  new winston.transports.Console({
    format: winston.format.simple()
  })
];

// Shared logger; every entry is tagged with the service name.
const logger = winston.createLogger({
  level: 'info',
  format: logFormat,
  defaultMeta: { service: 'api-gateway' },
  transports: logTransports
});

module.exports = logger;

リクエストログ

// api-gateway/src/middleware/requestLogger.js
const logger = require('../utils/logger');

/**
 * Express middleware that writes one structured log entry per request once
 * the response has finished.
 *
 * @param {object} req - Incoming request; `id`, `method`, `path`, `ip`, the
 *   `user-agent` header and `user.userId` are copied into the entry.
 * @param {object} res - Response whose `finish` event triggers the log.
 * @param {Function} next - Called immediately after the listener is attached.
 */
function requestLogger(req, res, next) {
  const startedAt = Date.now();

  const logCompletedRequest = () => {
    const elapsedMs = Date.now() - startedAt;

    const entry = {
      type: 'request',
      requestId: req.id,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: `${elapsedMs}ms`,
      ip: req.ip,
      userAgent: req.get('user-agent'),
      // Requests without a user are logged with a null userId.
      userId: req.user?.userId || null
    };

    logger.info(entry);
  };

  res.on('finish', logCompletedRequest);

  next();
}

module.exports = requestLogger;

🐳 Docker Composeでの統合

# docker-compose.yml
# Runs the API Gateway together with Prometheus, Grafana and Alertmanager.
version: '3.8'

services:
  api-gateway:
    build: ./api-gateway
    ports:
      - "3000:3000"
    # Container health is derived from the gateway's /health endpoint.
    # NOTE(review): requires wget inside the api-gateway image — confirm.
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  prometheus:
    image: prom/prometheus:latest
    # Scrape config and alert rules are mounted from ./prometheus on the host.
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      # Enables the HTTP reload/shutdown endpoints (/-/reload, /-/quit).
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    # Published on host port 3001; container port 3000 is Grafana's own.
    ports:
      - "3001:3000"
    environment:
      # NOTE(review): trivial admin password — replace outside local demos.
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

# Named volume holding Grafana's data directory.
volumes:
  grafana-storage:

🎯 次のステップ

Day 10では、API Gatewayの最適化について学びます。キャッシング戦略、コネクションプーリング、パフォーマンスチューニングを実装しましょう。

🔗 関連リンク

次回予告: Day 10では、Redisキャッシュとコネクションプーリングによるパフォーマンス最適化を詳しく解説します！

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up