はじめに
こんにちは。Wano株式会社エンジニアのnariと申します。
Terraform記事第三弾ということで、今回はAWS上のPipelineのモニタリング機構に関して記事にしたいと思います。
何を作ったか
- AWS CodePipeline、ECS、CodeBuildのステータスが変化すると、Slackに通知してくれる仕組みをApex、Terraform、GoでIaC化して作った
全体構成
- CodePipeline/CodeBuild/ECSのステータスをCloudWatch Eventsで監視
- Statusの変更があった場合に、Lambdaを起動
- Lambda関数でステータスを、Slackへ通知
何故作ったか
- CodePipelineの進捗や結果を確認するために、毎回コンソールまで見に行くのが面倒くさい
- buildやdeploy失敗を見逃さないアラートの仕組みが欲しかった
何故Apexなのか
- モニタリングリソースが増えるたびにCloudWatchEventsを手動で設定するのが面倒くさい
- かといって、lambda関連のリソースをterraformだけで管理するのは辛い(バイナリ化してzip、バージョン管理etc)
- Apexならapex infraというterraformコマンドをラップした便利コマンドを使用できる(関数名などをvariableに設定するだけでよしなに補完してくれる)
開発環境
Apex 1.0.0-rc2
Terraform 0.12.0
Provider(aws) 2.12.0
Go 1.12.4
ディレクトリ構成
monitoring/
├ functions/
| └ pipeline_notice/
| ├ function.json
| └ main.go
├ infrastructure/
| ├ main.tf
| ├ outputs.tf
| ├ variables.tf
| └ event_pattern/
| ├ codepipeline.json
| ├ codebuild.json
| └ ecs.json
└ project.json
それぞれのコードのご紹介
Apex
- project.json にproject全体の設定
project.json
{
"name": "hogehoge",
"description": "hogehoge",
"nameTemplate": "{{.Project.Name}}_{{.Function.Name}}"
}
- function.jsonにそれぞれの関数の設定を
function.json
{
"name": "hogehoge",
"description": "hogehoge",
"runtime": "go1.x",
"role": "arn:aws:iam::xxxxxxxxxxxxx:role/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"environment": {
"webHookUrl": "https://hooks.slack.com/services/xxxxxxx/xxxxxxxxx/xxxxxxxxxxxxxxx",
"slackChannel": "xxxxxxxxxxxxxxxxx"
},
"memory": 1024,
"timeout": 120,
"handler": "main",
"hooks": {
"build": "go get -v -t -d ./... && GOOS=linux GOARCH=amd64 go build -o main main.go",
"clean": "rm -f main"
}
}
Go(lambda function)
main.go
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/aws/aws-lambda-go/events"
"github.com/aws/aws-lambda-go/lambda"
"os"
"strings"
...
)
// CodePipelineStatus holds the fields of a CodePipeline event's detail payload
// that notifyCodePipelineStatus decodes with json.Unmarshal.
type CodePipelineStatus struct {
	// Pipeline is read from the "pipeline" key; used in the console URL and the message title.
	Pipeline string `json:"pipeline"`
	// ExecutionID is read from the "execution-id" key.
	ExecutionID string `json:"execution-id"`
	// State is read from the "state" key and compared against the CODEPIPELINE_STATE constants.
	State string `json:"state"`
	// Version is read from the "version" key.
	Version int `json:"version"`
}
// CodeBuildStatus holds the fields of a CodeBuild event's detail payload
// that notifyCodeBuildStatus decodes with json.Unmarshal.
type CodeBuildStatus struct {
	// BuildStatus is read from the "build-status" key and compared against the CODEBUILD_STATE constants.
	BuildStatus string `json:"build-status"`
	// ProjectName is read from the "project-name" key.
	ProjectName string `json:"project-name"`
	// BuildID is read from the "build-id" key; the part after "/" is used as the build identifier.
	BuildID string `json:"build-id"`
}
// EcsStatus holds the fields of an ECS event's detail payload
// that notifyEcsStatus decodes with json.Unmarshal.
type EcsStatus struct {
	// ClusterArn is read from the "clusterArn" key.
	ClusterArn string `json:"clusterArn"`
	// TaskArn is read from the "taskArn" key.
	TaskArn string `json:"taskArn"`
	// TaskDefinitionArn is read from the "taskDefinitionArn" key.
	TaskDefinitionArn string `json:"taskDefinitionArn"`
	// DesiredStatus is read from the "desiredStatus" key.
	DesiredStatus string `json:"desiredStatus"`
	// LastStatus is read from the "lastStatus" key and compared against the ECS_STATE constants.
	LastStatus string `json:"lastStatus"`
}
// CODEPIPELINE_STATE is the string type used to switch on the "state" value
// of a CodePipeline event detail.
type CODEPIPELINE_STATE string

// Pipeline execution states handled by notifyCodePipelineStatus.
const (
	CODEPIPELINE_STARTED_STATE   = CODEPIPELINE_STATE("STARTED")
	CODEPIPELINE_SUCCEEDED_STATE = CODEPIPELINE_STATE("SUCCEEDED")
	CODEPIPELINE_RESUMED_STATE   = CODEPIPELINE_STATE("RESUMED")
	CODEPIPELINE_FAILED_STATE    = CODEPIPELINE_STATE("FAILED")
	CODEPIPELINE_CANCELED_STATE  = CODEPIPELINE_STATE("CANCELED")
)
// CODEBUILD_STATE is the string type used to switch on the "build-status"
// value of a CodeBuild event detail.
type CODEBUILD_STATE string

// Build states handled by notifyCodeBuildStatus.
const (
	CODEBUILD_STOPPED_STATE     = CODEBUILD_STATE("STOPPED")
	CODEBUILD_SUCCEEDED_STATE   = CODEBUILD_STATE("SUCCEEDED")
	CODEBUILD_IN_PROGRESS_STATE = CODEBUILD_STATE("IN_PROGRESS")
	CODEBUILD_FAILED_STATE      = CODEBUILD_STATE("FAILED")
)
// ECS_STATE is the string type used to switch on the "lastStatus" value
// of an ECS event detail.
type ECS_STATE string

// Task states handled by notifyEcsStatus.
const (
	ECS_RUNNING_STATE = ECS_STATE("RUNNING")
	ECS_STOPPED_STATE = ECS_STATE("STOPPED")
)
// Slack settings passed to slack_reporter.ReportToLaboonSlack by the notify*
// functions (presumably the posting icon and display name — confirm against
// slack_reporter).
const (
	SLACK_ICON = ":crocus:"
	SLACK_NAME = "stagingのdeploypipelineを監視するクロッカス"

	// The notify* functions reference the settings under these names, which
	// were not declared anywhere in this file; they alias the values above so
	// both spellings resolve to the same setting.
	SLACK_MONITORING_PIPELINE_ICON = SLACK_ICON
	SLACK_MONITORING_PIPELINE_NAME = SLACK_NAME
)
// main registers noticeHandler as the Lambda handler via lambda.Start.
func main() {
lambda.Start(noticeHandler)
}
// noticeHandler is the Lambda entry point for CloudWatch events.
//
// It reads the Slack webhook URL and channel from the "webHookUrl" and
// "slackChannel" environment variables, picks a notifier from event.Source
// ("aws.codepipeline", "aws.codebuild" or "aws.ecs") and runs it. An event from
// any other source is logged and ignored (nil is returned). A notifier error is
// logged and returned to the caller.
//
// The context parameter is named ctx so that it no longer shadows the imported
// context package inside the function body.
func noticeHandler(ctx context.Context, event events.CloudWatchEvent) (e error) {
	webhookURL := os.Getenv("webHookUrl")
	channel := os.Getenv("slackChannel")

	// All notifiers share one signature, so select first and call once.
	var notify func(events.CloudWatchEvent, string, string) error
	switch event.Source {
	case "aws.codepipeline":
		notify = notifyCodePipelineStatus
	case "aws.codebuild":
		notify = notifyCodeBuildStatus
	case "aws.ecs":
		notify = notifyEcsStatus
	default:
		log.Info("想定するリソースのイベントではない")
		return nil
	}

	if err := notify(event, webhookURL, channel); err != nil {
		log.Error(err)
		return err
	}
	return nil
}
// notifyCodePipelineStatus decodes event.Detail into a CodePipelineStatus and
// reports the pipeline execution state to Slack.
//
// The message text carries the execution id, the state and a console URL built
// from event.Region, the pipeline name and the execution id. The title and
// message level depend on the state; a state that matches none of the
// CODEPIPELINE_STATE constants is sent with an empty title at NOTIFY level.
// Decode and report errors are logged and returned.
func notifyCodePipelineStatus(event events.CloudWatchEvent, webhookURL string, channel string) (e error) {
	var detail CodePipelineStatus
	if err := json.Unmarshal([]byte(event.Detail), &detail); err != nil {
		log.Error(err)
		return err
	}

	consoleURL := fmt.Sprintf(
		"https://%v.console.aws.amazon.com/codesuite/codepipeline/pipelines/%v/executions/%v",
		event.Region, detail.Pipeline, detail.ExecutionID,
	)
	body := fmt.Sprintf("*execution-id : %v*\n `state:%v` \n 詳細は %v", detail.ExecutionID, detail.State, consoleURL)

	// NOTIFY with an empty title is the fallback for states not listed below.
	var level slack_reporter.SLACK_MESSAGE_LEVEL = slack_reporter.SLACK_MESSAGE_LEVEL_NOTIFY
	heading := ""
	switch CODEPIPELINE_STATE(detail.State) {
	case CODEPIPELINE_STARTED_STATE:
		heading = fmt.Sprintf("*%vのpipeline開始*", detail.Pipeline)
	case CODEPIPELINE_CANCELED_STATE, CODEPIPELINE_RESUMED_STATE:
		heading = fmt.Sprintf("*%vが%v*", detail.Pipeline, detail.State)
	case CODEPIPELINE_FAILED_STATE:
		heading = fmt.Sprintf("*%vのpipeline失敗。。。*", detail.Pipeline)
		level = slack_reporter.SLACK_MESSAGE_LEVEL_ALART
	case CODEPIPELINE_SUCCEEDED_STATE:
		heading = fmt.Sprintf("*%vのpipeline成功!!*", detail.Pipeline)
		level = slack_reporter.SLACK_MESSAGE_LEVEL_OK
	}

	if err := slack_reporter.ReportToLaboonSlack(webhookURL, channel, SLACK_MONITORING_PIPELINE_NAME, SLACK_MONITORING_PIPELINE_ICON, heading, body, level); err != nil {
		log.Error(err)
		return err
	}
	return nil
}
// notifyCodeBuildStatus decodes event.Detail into a CodeBuildStatus and reports
// the build state to Slack.
//
// Events for the "hoge-auto-test" project are ignored (nil is returned without
// notifying). The build identifier shown in the message and used in the console
// URL is the part of build-id after its last "/"; a build-id without "/" is used
// as is instead of causing an index-out-of-range panic. A build status that
// matches none of the CODEBUILD_STATE constants is sent with an empty title at
// NOTIFY level. Decode and report errors are logged and returned.
func notifyCodeBuildStatus(event events.CloudWatchEvent, webhookURL string, channel string) (e error) {
	status := &CodeBuildStatus{}
	if err := json.Unmarshal([]byte(event.Detail), status); err != nil {
		log.Error(err)
		return err
	}

	// Builds of the test CI project are ignored; checked before any parsing of
	// the build id so an ignored event can never fail.
	if status.ProjectName == "hoge-auto-test" {
		return nil
	}

	// LastIndex returns -1 when there is no "/", which makes the slice start at
	// 0 and keeps the whole string.
	buildID := status.BuildID[strings.LastIndex(status.BuildID, "/")+1:]

	// The region is passed as the "region" query parameter, matching the ECS
	// console URLs built elsewhere in this file.
	codebuildURL := fmt.Sprintf("https://%v.console.aws.amazon.com/codebuild/home?region=%v#/builds/%v/view/new",
		event.Region, event.Region, buildID)
	text := fmt.Sprintf("*build-id : %v*\n `state:%v` \n 詳細は %v", buildID, status.BuildStatus, codebuildURL)

	var title string
	var messageLevel slack_reporter.SLACK_MESSAGE_LEVEL
	switch CODEBUILD_STATE(status.BuildStatus) {
	case CODEBUILD_IN_PROGRESS_STATE:
		title = fmt.Sprintf("*%vがbuild開始*", status.ProjectName)
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_NOTIFY
	case CODEBUILD_FAILED_STATE, CODEBUILD_STOPPED_STATE:
		title = fmt.Sprintf("*%vがbuild失敗か停止した。。。*", status.ProjectName)
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_ALART
	case CODEBUILD_SUCCEEDED_STATE:
		title = fmt.Sprintf("*%vがbuild成功*", status.ProjectName)
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_OK
	default:
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_NOTIFY
	}

	if err := slack_reporter.ReportToLaboonSlack(webhookURL, channel, SLACK_MONITORING_PIPELINE_NAME, SLACK_MONITORING_PIPELINE_ICON, title, text, messageLevel); err != nil {
		log.Error(err)
		return err
	}
	return nil
}
// notifyEcsStatus decodes event.Detail into an EcsStatus and reports the task
// state to Slack.
//
// Transitions where lastStatus differs from desiredStatus are not reported,
// except when lastStatus is STOPPED. Cluster, task and task-definition
// identifiers are the last "/"-separated segment of their ARNs, so a task ARN
// of the form "task/<cluster>/<task-id>" yields the task id, and an ARN without
// "/" is used as is instead of causing an index-out-of-range panic. A
// lastStatus that matches none of the ECS_STATE constants is sent with an empty
// title at NOTIFY level. Decode and report errors are logged and returned.
func notifyEcsStatus(event events.CloudWatchEvent, webhookURL string, channel string) (e error) {
	status := &EcsStatus{}
	if err := json.Unmarshal([]byte(event.Detail), status); err != nil {
		log.Error(err)
		return err
	}

	// Do not notify the intermediate steps on the way to the desired state
	// (an unexpected STOPPED is the exception).
	if status.LastStatus != status.DesiredStatus && status.LastStatus != string(ECS_STOPPED_STATE) {
		return nil
	}

	// lastSegment returns what follows the final "/" of arn, or arn itself when
	// it contains no "/" (LastIndex is then -1, so the slice starts at 0).
	lastSegment := func(arn string) string {
		return arn[strings.LastIndex(arn, "/")+1:]
	}
	clusterID := lastSegment(status.ClusterArn)
	taskID := lastSegment(status.TaskArn)
	taskDefinitionID := lastSegment(status.TaskDefinitionArn)

	taskURL := fmt.Sprintf("https://%v.console.aws.amazon.com/ecs/home?region=%v#/clusters/%v/tasks/%v/details",
		event.Region, event.Region, clusterID, taskID)
	// The first ":" of the task-definition id is turned into "/" for the URL path.
	taskDefinitionURL := fmt.Sprintf("https://%v.console.aws.amazon.com/ecs/home?region=%v#/taskDefinitions/%v",
		event.Region, event.Region, strings.Replace(taskDefinitionID, ":", "/", 1))
	text := fmt.Sprintf("*clusterID:%v* \n `state:%v` \n <%v|Task> (_<%v|%v>_)", clusterID, status.LastStatus, taskURL, taskDefinitionURL, taskDefinitionID)

	var title string
	var messageLevel slack_reporter.SLACK_MESSAGE_LEVEL
	switch ECS_STATE(status.LastStatus) {
	case ECS_RUNNING_STATE:
		title = fmt.Sprintf("*%vが起動に成功*", taskDefinitionID)
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_OK
	case ECS_STOPPED_STATE:
		if status.DesiredStatus == string(ECS_STOPPED_STATE) {
			// The stop was the desired state, so it is reported as OK.
			title = fmt.Sprintf("*%vが停止に成功*", taskDefinitionID)
			messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_OK
		} else {
			// Otherwise the stop was not requested: raise an alert.
			title = fmt.Sprintf("*%vが予期せぬ停止*", taskDefinitionID)
			messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_ALART
		}
	default:
		messageLevel = slack_reporter.SLACK_MESSAGE_LEVEL_NOTIFY
	}

	if err := slack_reporter.ReportToLaboonSlack(webhookURL, channel, SLACK_MONITORING_PIPELINE_NAME, SLACK_MONITORING_PIPELINE_ICON, title, text, messageLevel); err != nil {
		log.Error(err)
		return err
	}
	return nil
}
ポイント
- event.Sourceで処理を分岐し、events.CloudWatchEventのDetail(json.RawMessage型)をそれぞれのstatus構造体にunmarshalして扱う(そうすることで、監視するリソース毎にLambda関数を分けずに済む)
Terraform
monitoring module(今回のdirectory構成の外部にあります。詳細はこちら)
main.tf
/* monitoring */
// One CloudWatch Events rule per element of var.cloud_watch_event_objs.
// The rule name, description and event pattern are taken from the element at count.index.
resource "aws_cloudwatch_event_rule" "default" {
count = length(var.cloud_watch_event_objs)
name = var.cloud_watch_event_objs[count.index]["cloud_watch_event_rule_name"]
description = var.cloud_watch_event_objs[count.index]["cloud_watch_event_rule_description"]
event_pattern = var.cloud_watch_event_objs[count.index]["event_pattern"]
}
// One target per rule created above: the rule at count.index is pointed at the
// function_arn of the var.cloud_watch_event_objs element with the same index.
resource "aws_cloudwatch_event_target" "default" {
count = length(aws_cloudwatch_event_rule.default)
rule = aws_cloudwatch_event_rule.default[count.index].name
target_id = var.cloud_watch_event_objs[count.index]["aws_cloudwatch_event_target_id"]
arn = var.cloud_watch_event_objs[count.index]["function_arn"]
}
// One Lambda permission per target: allows events.amazonaws.com to invoke the
// target's function, with source_arn set to the rule at the same index.
resource "aws_lambda_permission" "default" {
count = length(aws_cloudwatch_event_target.default)
statement_id = var.cloud_watch_event_objs[count.index]["statement_id"]
action = "lambda:InvokeFunction"
// The target's arn attribute is the Lambda function ARN given as function_arn.
function_name = aws_cloudwatch_event_target.default[count.index].arn
principal = "events.amazonaws.com"
source_arn = aws_cloudwatch_event_rule.default[count.index].arn
}
variables.tf
/* required */
// Required input: one object per monitored event source. Each object supplies the
// rule name/description, the event pattern, the target id, the Lambda function ARN
// and the statement id used by the resources of this module.
variable "cloud_watch_event_objs" {
// map(string) would also work, but having the fields visible as parameters seemed easier to understand.
type = list(object({
statement_id = string
cloud_watch_event_rule_name = string
cloud_watch_event_rule_description = string
event_pattern = string
aws_cloudwatch_event_target_id = string
function_arn = string
})
)
description = "watchしたいリソースとイベントパターンとlambda関数を打ち込む"
}
moduleを使う側(main.tf variables.tf)
main.tf
...
/* monitoring */
// Instantiates the shared monitoring module, passing it the list of event
// objects defined in local.cloud_watch_event_objs.
module "laboon_monitoring" {
source = "../../../../modules/common/monitoring"
cloud_watch_event_objs = local.cloud_watch_event_objs
}
/* locals */
// Local values for this configuration. cloud_watch_event_objs is the list handed
// to the monitoring module; all three entries target var.apex_function_pipeline_notice.
locals {
/* monitoring required */
cloud_watch_event_objs = [
// Add to this array an object holding the resource to monitor, its event pattern and the function to fire.
// CodePipeline
{
statement_id = "AllowExecutionFromCloudWatchForCodePipeline"
cloud_watch_event_rule_name = "ad-pipeline-notice"
cloud_watch_event_rule_description = "ad-pipeline-notice"
event_pattern = file("./event_pattern/codepipeline.json")
aws_cloudwatch_event_target_id = "ad-pipeline-notice"
function_arn = var.apex_function_pipeline_notice
},
// CodeBuild
{
statement_id = "AllowExecutionFromCloudWatchForCodeBuild"
cloud_watch_event_rule_name = "ad-codebuild-notice"
cloud_watch_event_rule_description = "ad-codebuild-notice"
event_pattern = file("./event_pattern/codebuild.json")
aws_cloudwatch_event_target_id = "ad-codebuild-notice"
function_arn = var.apex_function_pipeline_notice
},
// ECS
{
statement_id = "AllowExecutionFromCloudWatchForECS"
cloud_watch_event_rule_name = "ad-ecs-notice"
cloud_watch_event_rule_description = "ad-ecs-notice"
event_pattern = file("./event_pattern/ecs.json")
aws_cloudwatch_event_target_id = "ad-ecs-notice"
function_arn = var.apex_function_pipeline_notice
}
]
}
variables.tf
// Input variables without type or default.
// NOTE(review): presumably these are the values that `apex infra` passes to Terraform — confirm against the Apex documentation.
// apex_function_pipeline_notice is used as function_arn for every entry of local.cloud_watch_event_objs.
variable "apex_function_pipeline_notice" {}
variable "apex_function_pipeline_notice_name" {}
variable "apex_function_names" {}
variable "apex_function_role" {}
variable "aws_region" {}
variable "apex_environment" {}
variable "apex_function_arns" {}
event_pattern/codebuild.json
{
"source": [
"aws.codebuild"
],
"detail-type": [
"CodeBuild Build State Change"
]
}
event_pattern/codepipeline.json
{
"source": [
"aws.codepipeline"
],
"detail-type": [
"CodePipeline Pipeline Execution State Change"
],
"detail": {
"state": [
"RESUMED",
"CANCELED",
"STARTED",
"FAILED",
"SUCCEEDED"
]
}
}
event_pattern/ecs.json
{
"source": [
"aws.ecs"
],
"detail-type": [
"ECS Task State Change"
]
}
ポイント
- list(object)型を用いることで、簡単に監視リソースを増やせるモジュールに