0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

Hybrid License System Day 9: ヘルスチェックと監視

Last updated at Posted at 2025-12-08

🎄 科学と神々株式会社 アドベントカレンダー 2025

Hybrid License System Day 9: ヘルスチェックと監視

API Gateway編 (4/5)


📖 はじめに

Day 9では、ヘルスチェックと監視を学びます。ヘルスチェックエンドポイント、サービス監視、Prometheusメトリクス収集、アラート設定を実装しましょう。


🏥 ヘルスチェックエンドポイント

基本的なヘルスチェック

// api-gateway/src/routes/health.js
const express = require('express');
const router = express.Router();

// Simple health check: always answers 200 with the current time and the
// process uptime. It does not look at any downstream dependency.
router.get('/health', (req, res) => {
  const report = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };

  res.status(200).json(report);
});

module.exports = router;

詳細なヘルスチェック

/**
 * Probes a single dependency by calling `GET <baseUrl>/health`.
 *
 * Never rejects: any failure (timeout, connection error, error status raised
 * by axios) is converted into an `unhealthy` result, so callers can safely
 * run several probes with `Promise.all`.
 *
 * @param {string} baseUrl - Base URL of the service to probe.
 * @param {number} [timeoutMs=3000] - Request timeout in milliseconds.
 * @returns {Promise<{status: string, latency?: string, error?: string}>}
 *   `{ status, latency }` when a response arrived, `{ status, error }` otherwise.
 */
async function probeDependency(baseUrl, timeoutMs = 3000) {
  const startedAt = Date.now();

  try {
    const response = await axios.get(`${baseUrl}/health`, { timeout: timeoutMs });

    return {
      status: response.status === 200 ? 'healthy' : 'unhealthy',
      latency: `${Date.now() - startedAt}ms`
    };
  } catch (error) {
    return {
      status: 'unhealthy',
      error: error.message
    };
  }
}

// Health check that also covers the downstream services.
// Responds 200 when every dependency is healthy, 503 ("degraded") otherwise.
router.get('/health/detailed', async (req, res) => {
  const health = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    services: {
      'auth-service': { status: 'unknown', latency: null },
      'admin-service': { status: 'unknown', latency: null }
    },
    memory: process.memoryUsage(),
    cpu: process.cpuUsage()
  };

  // Probe both services concurrently so the worst-case response time is one
  // timeout (3s) instead of the sum of both.
  const [authService, adminService] = await Promise.all([
    probeDependency(AUTH_SERVICE_URL),
    probeDependency(ADMIN_SERVICE_URL)
  ]);
  health.services['auth-service'] = authService;
  health.services['admin-service'] = adminService;

  // Overall status: a single unhealthy dependency degrades the gateway.
  const allHealthy = Object.values(health.services).every(
    (service) => service.status === 'healthy'
  );
  if (!allHealthy) {
    health.status = 'degraded';
  }

  res.status(allHealthy ? 200 : 503).json(health);
});

Readinessプローブ

// Endpoint for the Kubernetes readiness probe.
// Responds 200 when the Auth Service health endpoint answers within 2 seconds,
// and 503 with the error message otherwise.
router.get('/ready', async (req, res) => {
  try {
    // Minimal check: does the Auth Service respond at all?
    const probeOptions = { timeout: 2000 };
    await axios.get(`${AUTH_SERVICE_URL}/health`, probeOptions);

    const readyBody = {
      ready: true,
      timestamp: new Date().toISOString()
    };
    res.status(200).json(readyBody);
  } catch (err) {
    const notReadyBody = {
      ready: false,
      error: err.message,
      timestamp: new Date().toISOString()
    };
    res.status(503).json(notReadyBody);
  }
});

Livenessプローブ

// Endpoint for the Kubernetes liveness probe.
router.get('/live', (req, res) => {
  // Reaching this handler is the whole check: the process is alive if it can
  // answer. No dependency is contacted.
  const report = {
    alive: true,
    timestamp: new Date().toISOString(),
    uptime: process.uptime()
  };

  res.status(200).json(report);
});

📊 Prometheusメトリクス収集

prom-client導入

npm install prom-client

メトリクス定義

// api-gateway/src/metrics/prometheus.js
const promClient = require('prom-client');

// Enable the default process metrics (CPU, memory, etc.).
// No `timeout` option is passed: since prom-client 12 the default metrics are
// collected when the registry is scraped rather than on an interval, and the
// option is no longer supported.
promClient.collectDefaultMetrics();

// カスタムメトリクス定義

// 1. HTTPリクエスト数(Counter)
const httpRequestsTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// 2. HTTPリクエスト時間(Histogram)
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2.5, 5, 10]
});

// 3. アクティブなリクエスト数(Gauge)
const httpRequestsInProgress = new promClient.Gauge({
  name: 'http_requests_in_progress',
  help: 'Number of HTTP requests currently in progress',
  labelNames: ['method', 'route']
});

// 4. ライセンスアクティベーション数(Counter)
const licenseActivations = new promClient.Counter({
  name: 'license_activations_total',
  help: 'Total number of license activations',
  labelNames: ['plan', 'status']
});

// 5. バックエンドサービスエラー(Counter)
const backendErrors = new promClient.Counter({
  name: 'backend_errors_total',
  help: 'Total number of backend service errors',
  labelNames: ['service', 'error_type']
});

module.exports = {
  httpRequestsTotal,
  httpRequestDuration,
  httpRequestsInProgress,
  licenseActivations,
  backendErrors,
  register: promClient.register
};

メトリクスミドルウェア

// api-gateway/src/middleware/metrics.js
const {
  httpRequestsTotal,
  httpRequestDuration,
  httpRequestsInProgress
} = require('../metrics/prometheus');

/**
 * Express middleware that records Prometheus metrics for every request.
 *
 * - Increments the in-progress gauge when the request arrives and decrements
 *   it exactly once when the response finishes OR the connection closes, so
 *   aborted requests do not leave the gauge permanently raised.
 * - On `finish`, increments the request counter and observes the duration
 *   (in seconds) in the histogram.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Passes control to the next middleware.
 */
function metricsMiddleware(req, res, next) {
  // Monotonic clock: unaffected by wall-clock adjustments while in flight.
  const startedAt = process.hrtime.bigint();
  const method = req.method;

  // The router has usually not run yet, so `req.route` is normally undefined
  // here and this falls back to the raw path. The gauge must use the same
  // label pair for inc() and dec(), so this value is kept for both.
  const inFlightRoute = req.route?.path || req.path;

  let released = false;
  const releaseInProgress = () => {
    if (released) {
      return;
    }
    released = true;
    httpRequestsInProgress.labels(method, inFlightRoute).dec();
  };

  // Request start.
  httpRequestsInProgress.labels(method, inFlightRoute).inc();

  // Response fully sent.
  res.on('finish', () => {
    const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    // By now routing has happened: prefer the matched route pattern
    // (e.g. "/users/:id") over the concrete path to keep the number of
    // label values bounded.
    const route = req.route
      ? `${req.baseUrl ?? ''}${req.route.path}`
      : inFlightRoute;

    // Count the request.
    httpRequestsTotal.labels(method, route, res.statusCode).inc();

    // Record how long it took.
    httpRequestDuration.labels(method, route, res.statusCode).observe(durationSeconds);

    // One fewer request in flight.
    releaseInProgress();
  });

  // Connection closed; also covers aborts where 'finish' never fires.
  res.on('close', releaseInProgress);

  next();
}

module.exports = metricsMiddleware;

メトリクスエンドポイント

// api-gateway/src/routes/metrics.js
const express = require('express');
const router = express.Router();
const { register } = require('../metrics/prometheus');

// Prometheus metrics endpoint: serves the registry in the text exposition format.
router.get('/metrics', async (req, res) => {
  try {
    // Collect first, so a collection failure can still be answered with a
    // clean 500 before any header or body has been written.
    const body = await register.metrics();

    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (err) {
    // Express 4 does not catch rejections from async handlers; without this
    // the request would hang and the rejection would go unhandled.
    res.status(500).end(err instanceof Error ? err.message : String(err));
  }
});

module.exports = router;

カスタムメトリクスの記録

// api-gateway/src/routes/auth.js
const { licenseActivations, backendErrors } = require('../metrics/prometheus');

// Proxies a license activation to the Auth Service and records the outcome
// in the custom Prometheus counters.
router.post('/license/activate', async (req, res) => {
  try {
    const { data } = await axios.post(`${AUTH_SERVICE_URL}/activate`, req.body);

    // Record the successful activation, labelled with the returned plan.
    const plan = data.plan || 'unknown';
    licenseActivations.labels(plan, 'success').inc();

    res.json(data);
  } catch (err) {
    // Record the backend error and the failed activation.
    const errorType = err.code || 'unknown';
    backendErrors.labels('auth-service', errorType).inc();
    licenseActivations.labels('unknown', 'failure').inc();

    handleProxyError(err, res);
  }
});

📈 Grafanaダッシュボード

Prometheus設定

# prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alerting/recording rules are evaluated

# Load the alert rules. Without this section the rules file is never read.
# The path matches where the article's docker-compose.yml mounts alert_rules.yml.
rule_files:
  - /etc/prometheus/alert_rules.yml

# Where firing alerts are sent. Without this section alerts fire inside
# Prometheus but are never delivered to Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'api-gateway'
    static_configs:
      - targets: ['api-gateway:3000']
    metrics_path: '/metrics'

  - job_name: 'auth-service'
    static_configs:
      - targets: ['auth-service:3001']
    metrics_path: '/metrics'

  - job_name: 'admin-service'
    static_configs:
      - targets: ['admin-service:3002']
    metrics_path: '/metrics'

Grafanaダッシュボード設定(JSON)

{
  "dashboard": {
    "title": "Hybrid License System - API Gateway",
    "panels": [
      {
        "title": "Request Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])"
          }
        ]
      },
      {
        "title": "Request Duration (p95)",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
          }
        ]
      },
      {
        "title": "Error Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])"
          }
        ]
      },
      {
        "title": "Active Requests",
        "targets": [
          {
            "expr": "http_requests_in_progress"
          }
        ]
      },
      {
        "title": "License Activations",
        "targets": [
          {
            "expr": "rate(license_activations_total[5m])"
          }
        ]
      }
    ]
  }
}

🚨 アラート設定

Prometheusアラートルール

# alert_rules.yml
# Alerting rules for the API Gateway. Every rule must hold continuously for
# its `for:` duration before the alert fires.
groups:
  - name: api_gateway_alerts
    # Evaluation interval for this group.
    interval: 30s
    rules:
      # High Error Rate: per-second rate of 5xx responses over the last
      # 5 minutes is above 0.05.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/sec"

      # High Response Time: p95 request duration, estimated from the
      # histogram buckets, is above 2 seconds.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "P95 response time is {{ $value }} seconds"

      # Service Down: Prometheus failed to scrape the api-gateway job.
      - alert: ServiceDown
        expr: up{job="api-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API Gateway is down"
          description: "API Gateway has been down for more than 1 minute"

      # Backend Service Errors: per-second rate of backend errors over the
      # last 5 minutes is above 0.1.
      - alert: BackendServiceErrors
        expr: rate(backend_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High backend service error rate"
          description: "Backend errors: {{ $value }} errors/sec"

      # High Memory Usage: resident memory, converted from bytes to MB,
      # is above 500.
      - alert: HighMemoryUsage
        expr: process_resident_memory_bytes / 1024 / 1024 > 500
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }} MB"

Alertmanager設定

# alertmanager.yml
# Routes alerts to receivers by their `severity` label.
global:
  resolve_timeout: 5m

route:
  # Alerts sharing the same alertname and severity are batched together.
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # Used when no child route below matches.
  receiver: 'default-receiver'

  # NOTE(review): newer Alertmanager releases prefer `matchers:` over
  # `match:` — confirm against the version you deploy.
  routes:
    - match:
        severity: critical
      receiver: 'critical-receiver'
      # Keep evaluating the sibling routes after this one matches.
      continue: true

    - match:
        severity: warning
      receiver: 'warning-receiver'

receivers:
  - name: 'default-receiver'
    webhook_configs:
      - url: 'http://slack-webhook/alerts'

  - name: 'critical-receiver'
    email_configs:
      - to: 'ops@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager'
        # NOTE(review): placeholder credential — do not commit a real
        # password in this file.
        auth_password: 'password'

  - name: 'warning-receiver'
    slack_configs:
      # Placeholder webhook URL; replace with the real Slack incoming webhook.
      - api_url: 'https://hooks.slack.com/services/XXX'
        channel: '#alerts'

🔍 ログ集約

構造化ログ

// api-gateway/src/utils/logger.js
const winston = require('winston');

// Structured logger for the gateway: JSON entries with a timestamp, tagged
// with the service name.
const jsonWithTimestamp = winston.format.combine(
  winston.format.timestamp(),
  winston.format.json()
);

const logTransports = [
  // Errors only.
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  // Everything at the logger's level and above.
  new winston.transports.File({ filename: 'combined.log' }),
  // Console output uses winston's simple format.
  new winston.transports.Console({
    format: winston.format.simple()
  })
];

const logger = winston.createLogger({
  level: 'info',
  format: jsonWithTimestamp,
  defaultMeta: { service: 'api-gateway' },
  transports: logTransports
});

module.exports = logger;

リクエストログ

// api-gateway/src/middleware/requestLogger.js
const logger = require('../utils/logger');

/**
 * Express middleware that writes one structured log entry per request,
 * emitted when the response has finished.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Passes control to the next middleware.
 */
function requestLogger(req, res, next) {
  const startedAt = Date.now();

  const writeEntry = () => {
    const elapsedMs = Date.now() - startedAt;
    const entry = {
      type: 'request',
      requestId: req.id,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      duration: `${elapsedMs}ms`,
      ip: req.ip,
      userAgent: req.get('user-agent'),
      // null when the request carries no user.
      userId: req.user?.userId || null
    };

    logger.info(entry);
  };

  res.on('finish', writeEntry);

  next();
}

module.exports = requestLogger;

🐳 Docker Composeでの統合

# docker-compose.yml
# Runs the API Gateway together with the monitoring stack
# (Prometheus, Grafana, Alertmanager).
version: '3.8'

services:
  api-gateway:
    build: ./api-gateway
    ports:
      - "3000:3000"
    # Container health is taken from the gateway's own /health endpoint.
    # NOTE(review): this requires `wget` to exist in the image — confirm.
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  prometheus:
    image: prom/prometheus:latest
    volumes:
      # Main configuration and the alert rules file.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
    ports:
      - "9090:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    ports:
      # Published on host port 3001; the container listens on 3000.
      - "3001:3000"
    environment:
      # NOTE(review): trivial admin password — change it for anything that
      # is not a local demo.
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      # Named volume so Grafana data survives container recreation.
      - grafana-storage:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'

volumes:
  grafana-storage:

🎯 次のステップ

Day 10では、API Gatewayの最適化について学びます。キャッシング戦略、コネクションプーリング、パフォーマンスチューニングを実装しましょう。


🔗 関連リンク


次回予告: Day 10では、Redisキャッシュとコネクションプーリングによるパフォーマンス最適化を詳しく解説します!


Copyright © 2025 Gods & Golem, Inc. All rights reserved.

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?