17
11

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

TerraformでMonitoring As Code ~メトリクスアラーム設定編(slack・電話通知)~

Last updated at Posted at 2019-08-12

はじめに

こんにちは。Wano株式会社エンジニアのnariと申します。

最近、TerraformやLambda(Go)などを使ったMonitoring As Codeの話を記事にすることが多くなってきました。

今までは、リソースのステータスチェックシステム周りの話が多かったのですが、今回はメトリクスアラーム設定周りのIaCに関して記事にしていこうかなと思います。

過去のMonitoring As Code系記事

システム全体像

スクリーンショット 2019-08-12 22.21.19.png

何故Monitoring As Codeしたいか

  • 同じようなアラーム設定を手動で繰り返しやりたくない
  • 異なる環境で設定を使いまわしたい

Terraformのソース

module(metrics alarm)

  • 一つのSNS Topicが複数アラートと、複数通知先(slack,電話etc)を持てるように設計した
  • 今回の場合は、通知先の前に必ずlambdaが挟まる想定で作っている
main.tf
//登録通知先が、全てlambdaの場合のみ使用可能なmodule
/* CloudWatch */
/* Creates one CloudWatch metric alarm per element of var.cwa_objs. */
resource "aws_cloudwatch_metric_alarm" "this" {
  count = length(var.cwa_objs)

  /* identity */
  alarm_name        = var.cwa_objs[count.index].alarm_name
  alarm_description = var.cwa_objs[count.index].alarm_description

  /* metric being watched */
  namespace   = var.cwa_objs[count.index].namespace
  metric_name = var.cwa_objs[count.index].metric_name
  dimensions  = var.cwa_objs[count.index].dimensions
  statistic   = var.cwa_objs[count.index].statistic
  period      = var.cwa_objs[count.index].period

  /* evaluation rule */
  comparison_operator = var.cwa_objs[count.index].comparison_operator
  evaluation_periods  = var.cwa_objs[count.index].evaluation_periods
  threshold           = var.cwa_objs[count.index].threshold

  /* Both the ALARM and the OK transition are published to this module's SNS topic. */
  alarm_actions = [aws_sns_topic.this.arn]
  ok_actions    = [aws_sns_topic.this.arn]

  /* optional */
  tags = var.cwa_objs[count.index].cw_tags
}


/* SNS */
/* The single SNS topic that every alarm created by this module publishes to. */
resource "aws_sns_topic" "this" {
  /* required */
  name = var.sns_name

  /* optional */
  tags         = var.sns_tags
  policy       = var.policy
  display_name = var.display_name
}

/* Subscribes one Lambda endpoint to the topic per element of
   var.sns_topic_subscription_objs.
   NOTE(review): the label says "notify_to_slack", but this resource is created for
   every endpoint passed in; renaming it would change the resource address. */
resource "aws_sns_topic_subscription" "notify_to_slack" {
  count = length(var.sns_topic_subscription_objs)

  /* required */
  topic_arn = aws_sns_topic.this.arn
  protocol  = "lambda"
  endpoint  = var.sns_topic_subscription_objs[count.index].endpoint

  /* optional */
  raw_message_delivery = var.sns_topic_subscription_objs[count.index].raw_message_delivery
}

/* Lambda permission */
/* Grants the module's SNS topic permission to invoke each subscribed Lambda function. */
resource "aws_lambda_permission" "default" {
  count = length(var.sns_topic_subscription_objs)

  /* which function, under which statement id */
  function_name = var.sns_topic_subscription_objs[count.index].endpoint
  statement_id  = var.sns_topic_subscription_objs[count.index].statement_id

  /* who may do what: only this module's topic, only invoke */
  principal  = "sns.amazonaws.com"
  source_arn = aws_sns_topic.this.arn
  action     = "lambda:InvokeFunction"
}


variables.tf
/* CloudWatch */
/* required */
/* Alarms to create. Each element becomes one aws_cloudwatch_metric_alarm
   that notifies this module's SNS topic. */
variable "cwa_objs" {
  description = "通知したいアラームをここに詰める"
  type = list(object({
    alarm_name          = string
    comparison_operator = string
    evaluation_periods  = number
    threshold           = number
    alarm_description   = string
    metric_name         = string
    namespace           = string
    /* Typed as number, like evaluation_periods and threshold. Quoted values such
       as "300" are still accepted because Terraform converts numeric strings. */
    period     = number
    statistic  = string
    dimensions = map(string)
    /* Tags for the alarm itself; pass {} for none. */
    cw_tags = map(string)
  }))
}


/* SNS */
/* required */
/* Name given to the SNS topic created by this module (required: no default). */
variable "sns_name" {
  type        = string
  description = "The name of the SNS topic to create"
}

/* Notification targets. Each element yields one topic subscription and one
   Lambda invoke permission; only Lambda endpoints are supported. */
variable "sns_topic_subscription_objs" {
  type = list(object({
    /* Passed to the subscription's endpoint and to the permission's function_name. */
    endpoint = string
    /* Statement id of the Lambda permission created for this endpoint. */
    statement_id         = string
    raw_message_delivery = bool
  }))
  description = "登録したい通知先内容をここに詰める(lambda限定)"
}

/* optional */
/* NOTE(review): the module's main.tf shown alongside this file never references
   this variable, so the topic appears to be created regardless of its value;
   confirm before relying on it. */
variable "create_sns_topic" {
  type        = bool
  default     = true
  description = "Whether to create the SNS topic"
}

/* Optional display name for the topic; the default null leaves it unset. */
variable "display_name" {
  type        = string
  default     = null
  description = "The display name for the SNS topic"
}

/* Optional policy document for the topic, as a JSON string; the default null leaves it unset. */
variable "policy" {
  type        = string
  default     = null
  description = "The fully-formed AWS policy as JSON"
}

/* Tags for the SNS topic only. Alarm tags are supplied per alarm through
   cwa_objs[*].cw_tags, so the description no longer claims "all resources". */
variable "sns_tags" {
  description = "A mapping of tags to assign to the SNS topic"
  type        = map(string)
  default     = {}
}

moduleを使う側

  • moduleを利用して、アラートと通知先の配列にひたすら設定したものを詰める
main.tf
/* metrics_alarm_to_slack  */
module "laboon_metrics_alarm_to_slack" {
  source = "../../../../modules/common/metric_alarm"

  /* Topic the alarms below publish to. */
  sns_name = "laboon_stage_metrics_alert_to_slack"

  /* Alarms routed to that topic. */
  cwa_objs = local.metrics_alarm_to_slack_objs

  /* To add a notification target, append its Lambda to this list. */
  sns_topic_subscription_objs = [
    {
      statement_id         = "laboon_stage_metrics_alert_to_slack"
      endpoint             = var.apex_function_laboon_stage_metrics_alarm_to_slack
      raw_message_delivery = false
    },
  ]
}

/* metrics_alarm_to_call  */
module "laboon_metrics_alarm_to_call" {
  source = "../../../../modules/common/metric_alarm"

  /* Topic the alarms below publish to. */
  sns_name = "hoge_metrics_alert_to_call"

  /* Alarms routed to that topic. */
  cwa_objs = local.metrics_alarm_to_call_objs

  /* To add a notification target, append its Lambda to this list.
     Two Lambdas are subscribed here: one for the call, one for Slack. */
  sns_topic_subscription_objs = [
    {
      statement_id         = "hoge_metrics_alert_to_call"
      endpoint             = var.apex_function_hoge_metrics_alarm_to_call
      raw_message_delivery = false
    },
    {
      statement_id         = "hoge_metrics_alert_to_slack_with_call"
      endpoint             = var.apex_function_hoge_metrics_alarm_to_slack
      raw_message_delivery = false
    },
  ]
}

  /* metrics_alarm_to_slack required*/
  metrics_alarm_to_slack_objs = [
    //この配列にslackに通知したいメトリクス(not critical)を追加する
    //LAMBDA
    {
      alarm_name          = "start_execution(stepfunctions)lambdaの死"
      comparison_operator = "GreaterThanThreshold"
      evaluation_periods  = "1"
      metric_name         = "Errors"
      namespace           = "AWS/Lambda"
      period              = "300"
      statistic           = "Average"
      threshold           = "0.0"
      alarm_description   = "start_execution(stepfunctions)lambdaの死"
      dimensions = {
        FunctionName = "hoge_start_execution"
        Resource     = "hogee_start_execution:current"
      }
      cw_tags = {}
    },
    //ECS/ContainerInsights
    {
      alarm_name          = "CpuUtilized-for-hoge-ecs-task"
      comparison_operator = "GreaterThanThreshold"
      evaluation_periods  = "2"
      metric_name         = "CpuUtilized"
      namespace           = "ECS/ContainerInsights"
      period              = "60"
      statistic           = "Average"
      threshold           = "60"
      alarm_description   = "CpuUtilized-for-hoge-ecs-task"
      dimensions = {
        TaskDefinitionFamily = "hoge-ad-fluentd-ecs"
        ClusterName          = "hoge-ad-cluster"
      }
      cw_tags = {}
    },
     ...

  /* metrics_alarm_to_call required*/
  metrics_alarm_to_call_objs = [
    //この配列に電話に通知したいメトリクス(critical)を追加する
    //ELB
    {
      alarm_name          = "hoge-ad500Error>0"
      comparison_operator = "GreaterThanThreshold"
      evaluation_periods  = "1"
      metric_name         = "HTTPCode_Target_5XX_Count"
      namespace           = "AWS/ApplicationELB"
      period              = "300"
      statistic           = "Average"
      threshold           = "0.0"
      alarm_description   = "hoge-ad500Error>0"
      dimensions = {
        TargetGroup  = "targetgroup/hoge-lb-target-group/xxxxxxxxxxxxxxxxxxx"
        LoadBalancer = "app/hoge-aws-lb/xxxxxxxxxxxxxxxxxxx"
      }
      cw_tags = {}
    },
    ...

Lambda(Go)のソース

slackに通知

  • アラートの場合とOKの場合で内容を変える
main.go
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"github.com/aws/aws-lambda-go/events"
	"github.com/aws/aws-lambda-go/lambda"
	"os"
    . . .
)

// SNSMessage holds the fields this Lambda reads from the JSON body of an SNS
// message. The tags spell out the key names encoding/json already matches by
// default for these field names.
type SNSMessage struct {
	// AlarmName is used as the Slack title and in the console link.
	AlarmName string `json:"AlarmName"`
	// NewStateValue is compared against "ALARM" by the handler.
	NewStateValue string `json:"NewStateValue"`
	// NewStateReason is included in the Slack text.
	NewStateReason string `json:"NewStateReason"`
}

// main hands metricsAlarmToSlackHandler to lambda.Start.
func main() {
	lambda.Start(metricsAlarmToSlackHandler)
}

// metricsAlarmToSlackHandler reports the alarm state change carried by each
// record of an SNS event to Slack.
//
// The webhook URL and channel come from the webHookUrl and slackChannel
// environment variables. A message whose NewStateValue is "ALARM" is reported
// as a warning; any other state is reported as OK.
//
// It returns an error when the event has no records, when a message body is
// not valid JSON, or when posting to Slack fails.
func metricsAlarmToSlackHandler(ctx context.Context, event events.SNSEvent) error {
	// Indexing Records[0] unconditionally would panic on an empty event.
	if len(event.Records) == 0 {
		err := fmt.Errorf("sns event contains no records")
		log.Error(err)
		return err
	}

	webhookURL := os.Getenv("webHookUrl")
	channel := os.Getenv("slackChannel")

	for _, record := range event.Records {
		snsMessage := &SNSMessage{}
		if err := json.Unmarshal([]byte(record.SNS.Message), snsMessage); err != nil {
			log.Error(err)
			return err
		}

		// Start from the OK presentation; only the ALARM state overrides it.
		label := "OK"
		slackIcon := ":nick:"
		slackName := "hogeのモニタリング:ok"
		messageLevel := slack_reporter.SLACK_MESSAGE_LEVEL_OK
		if snsMessage.NewStateValue == "ALARM" {
			label = "WARNING"
			slackIcon = ":sweat:"
			slackName = "hogeのモニタリング:warning"
			messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_ALART
		}

		title := snsMessage.AlarmName
		// NOTE(review): the alarm name is placed in the URL unescaped; names
		// containing spaces or characters such as '>' may produce a broken link.
		cwURL := fmt.Sprintf("https://ap-northeast-1.console.aws.amazon.com/cloudwatch/home?region=ap-northeast-1#alarmsV2:alarm/%v", title)
		text := fmt.Sprintf(":cloudwatch: *%v* :cloudwatch: \n アラーム:%v\n 理由:%v \n url:%v", label, title, snsMessage.NewStateReason, cwURL)

		if err := slack_reporter.ReportToSlack(webhookURL, channel, slackName, slackIcon, title, text, messageLevel); err != nil {
			log.Error(err)
			return err
		}
	}
	return nil
}

電話で通知(amazon connect)

  • Amazon Connectの設定はこちらがわかりやすい。こちらの設定はTerraform対応がないので、手動でやる
  • アラートの場合のみ電話をかける
main.go
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"github.com/aws/aws-lambda-go/events"
	"github.com/aws/aws-lambda-go/lambda"
	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/connect"
    ...
)

// SNSMessage holds the fields this Lambda reads from the JSON body of an SNS
// message. The tags spell out the key names encoding/json already matches by
// default for these field names.
type SNSMessage struct {
	// AlarmName is read out as part of the voice message.
	AlarmName string `json:"AlarmName"`
	// NewStateValue is compared against "ALARM" by the handler.
	NewStateValue string `json:"NewStateValue"`
	// NewStateReason is decoded but not used by the handler in this file.
	NewStateReason string `json:"NewStateReason"`
}

// main hands metricsAlarmToCallHandler to lambda.Start.
func main() {
	lambda.Start(metricsAlarmToCallHandler)
}

// metricsAlarmToCallHandler places an outbound Amazon Connect voice call for
// each record of an SNS event whose alarm state is "ALARM". Records in any
// other state are ignored, so an OK notification never triggers a call.
//
// It returns an error when the event has no records, when a message body is
// not valid JSON, when the AWS session cannot be created, or when starting the
// call fails.
func metricsAlarmToCallHandler(ctx context.Context, event events.SNSEvent) error {
	// Indexing Records[0] unconditionally would panic on an empty event.
	if len(event.Records) == 0 {
		err := fmt.Errorf("sns event contains no records")
		log.Error(err)
		return err
	}

	// Created on first use so that non-ALARM events never build a session.
	var cnt *connect.Connect

	for _, record := range event.Records {
		snsMessage := &SNSMessage{}
		if err := json.Unmarshal([]byte(record.SNS.Message), snsMessage); err != nil {
			log.Error(err)
			return err
		}

		if snsMessage.NewStateValue != "ALARM" {
			continue
		}

		if cnt == nil {
			// session.New is deprecated and hides configuration errors;
			// NewSession returns them.
			sess, err := session.NewSession()
			if err != nil {
				log.Error(err)
				return err
			}
			cnt = connect.New(sess, &aws.Config{Region: aws.String("ap-northeast-1")})
		}

		// The text is handed to the contact flow as the "message" attribute.
		message := fmt.Sprintf("hoge 本番環境にて障害が発生しました。直ちに対応してください。エラー内容は、%v", snsMessage.AlarmName)

		input := &connect.StartOutboundVoiceContactInput{
			DestinationPhoneNumber: aws.String("+81xxxxxxxxxxxx"),
			ContactFlowId:          aws.String("2877axxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"),
			InstanceId:             aws.String("29f67xxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"),
			SourcePhoneNumber:      aws.String("+81xxxxxxxxxx"),
			Attributes:             map[string]*string{"message": aws.String(message)},
		}
		// Passing ctx lets the request be cancelled along with the invocation.
		if _, err := cnt.StartOutboundVoiceContactWithContext(ctx, input); err != nil {
			log.Error(err)
			return err
		}
	}
	return nil
}

終わりに

今回はメトリクスアラーム周りのIaCの話を記事にしました。

最近ここら辺のタスクを業務で扱うことが出てきたので、そもそものクリティカルとそうでないものの区別や、リソースメトリクスのアラーム設定是非などに関してまだまだ考えがまとまっていないため、今回はとりあえず要件を満たす形で実装しました。 (ちょっとチェックボックス監視みがある。。)

これから運用していく中で、入門監視などから得た観点を整理して、リファクタしていこうと思っています(ここら辺もまた記事にします)

また、電話通知に関しては、エスカレーションや電話に出られなかった場合のリトライの実装などができていないため、対応ができ次第そこらへんも記事にしていこうかなと思います。

それにしても監視は面白い。これからもここら辺のタスクはガンガンやっていこうと思います(ビジネスKPI,SLOの設定絡みのとこもどんどんやりたい)

17
11
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
17
11

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?