Edited at

AWS AutoScalingで増減したEC2インスタンスに、動的にCloudWatchのAlarmをLambda(Go)で設定する


はじめに

こんにちは。Wano株式会社のエンジニアのnariと申します。

前回の記事で、EC2のメトリクスを要件を満たす形で収集できるようになりました。

しかし、このメトリクスに対してアラートを設定しようとすると、AutoScalingを設定しているインスタンスに関しては、スケールするたびにいちいちアラートを削除したり追加したりする必要があります。

それを手動で行うのは流石に現実的ではないため、前の記事で作成したステータスチェックシステム(Apex,Terraform)とLambda(Go)を使い、インスタンスの増減に合わせてアラームを動的に作成・削除することにしました。


システム全体像

スクリーンショット 2019-08-12 21.50.30.png


どう設定したか


1.LambdaのRoleに以下のpolicyを設定する


policy.json

{

"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": "arn:aws:logs:*:*:*"
},
{
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"cloudwatch:PutMetricAlarm",
"cloudwatch:DeleteAlarms",
"cloudwatch:DescribeAlarms"
],
"Resource": "*"
}
]
}


2.Lambda(Go)の以下のスクリプトをデプロイする(Apex)


main.go

package main

import (
"context"
"encoding/json"
"fmt"
"github.com/aws/aws-lambda-go/events"
"github.com/aws/aws-lambda-go/lambda"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/cloudwatch"
"github.com/labstack/gommon/log"
"os"
"strings"
)

// AutoScalingStatus is the decode target for the "detail" payload of an
// Auto Scaling CloudWatch event. The handler fills it with json.Unmarshal,
// which matches JSON keys against these field names.
type AutoScalingStatus struct {
	// AutoScalingGroupName is embedded in alarm names and used as a metric
	// dimension value.
	AutoScalingGroupName string

	// EC2InstanceId is embedded in alarm names and used as a metric
	// dimension value.
	EC2InstanceId string

	// Description is split on spaces by the handler; its first word decides
	// whether alarms are created or deleted.
	Description string
}

// main is the process entry point. It passes
// controlAutoScalingInstanceHandler to lambda.Start from aws-lambda-go.
func main() {
lambda.Start(controlAutoScalingInstanceHandler)
}

// controlAutoScalingInstanceHandler handles an Auto Scaling CloudWatch event.
//
// It decodes event.Detail into an AutoScalingStatus and inspects the first
// space-separated word of Description:
//   - "Launching"   -> createAlarm is called for the instance
//   - "Terminating" -> deleteAlarm is called for the instance
//   - anything else -> the event is logged and ignored (nil is returned)
//
// ctx is accepted to satisfy the Lambda handler signature but is not used.
//
// It returns the JSON decode error, the AWS session creation error, an error
// when SNS_ARN is unset for a launch event, or the error returned by
// createAlarm/deleteAlarm.
func controlAutoScalingInstanceHandler(ctx context.Context, event events.CloudWatchEvent) error {
	status := &AutoScalingStatus{}
	if err := json.Unmarshal([]byte(event.Detail), status); err != nil {
		log.Error(err)
		return err
	}

	// session.New is deprecated because it hides configuration errors;
	// NewSession returns them so the invocation fails with a clear cause.
	sess, err := session.NewSession()
	if err != nil {
		log.Error(err)
		return err
	}
	cw := cloudwatch.New(sess, &aws.Config{Region: aws.String("ap-northeast-1")})
	snsArn := os.Getenv("SNS_ARN")

	// strings.Split always returns at least one element, so splits[0] is safe
	// even when Description is empty.
	splits := strings.Split(status.Description, " ")
	log.Infof("splits:%v", splits)
	switch splits[0] {
	case "Launching":
		// Only alarm creation needs the SNS target; deletion must keep
		// working even if the variable is missing.
		if snsArn == "" {
			err := fmt.Errorf("SNS_ARN is not set: cannot create alarms for instance %v", status.EC2InstanceId)
			log.Error(err)
			return err
		}
		return createAlarm(status, cw, snsArn)
	case "Terminating":
		return deleteAlarm(status, cw, snsArn)
	default:
		log.Info("該当のstatusTypeではない")
		return nil
	}
}

// createAlarm submits every alarm definition returned by getAlarmInputs
// (called with status and snsArn) to CloudWatch through svc.PutMetricAlarm.
//
// Processing stops at the first failing request: that error is logged and
// returned, and the remaining definitions are not submitted. It returns nil
// when every request succeeds.
func createAlarm(status *AutoScalingStatus, svc *cloudwatch.CloudWatch, snsArn string) error {
	alarmInputs := getAlarmInputs(status, snsArn)
	for i := 0; i < len(alarmInputs); i++ {
		out, putErr := svc.PutMetricAlarm(alarmInputs[i])
		if putErr == nil {
			log.Infof("putMetricAlarmOutput:%v", out)
			continue
		}
		log.Error(putErr)
		return putErr
	}
	return nil
}

// deleteAlarm removes the five per-instance alarms in a single DeleteAlarms
// request. Each alarm name is "<prefix>-<AutoScalingGroupName>-<EC2InstanceId>".
//
// snsArn is not used by this function.
//
// It logs and returns the DeleteAlarms error, or nil on success.
func deleteAlarm(status *AutoScalingStatus, svc *cloudwatch.CloudWatch, snsArn string) error {
	// These prefixes must stay in sync with the alarm names produced by the
	// get*AlarmInput builders.
	prefixes := []string{
		"disk_used_percent",
		"mem_used_percent",
		"cpu_usage_system-cpu0",
		"cpu_usage_system-cpu1",
		"LoadAverage",
	}

	alarmNames := make([]*string, 0, len(prefixes))
	for _, prefix := range prefixes {
		name := fmt.Sprintf("%v-%v-%v", prefix, status.AutoScalingGroupName, status.EC2InstanceId)
		// aws.String copies the value, so each entry gets its own pointer.
		alarmNames = append(alarmNames, aws.String(name))
	}

	deleteAlarmOutput, err := svc.DeleteAlarms(&cloudwatch.DeleteAlarmsInput{AlarmNames: alarmNames})
	if err != nil {
		log.Error(err)
		return err
	}
	log.Infof("deleteAlarmOutput:%v", deleteAlarmOutput)
	return nil
}

// getAlarmInputs collects every alarm definition for one instance, in this
// order: disk, load average, memory, then the alarms returned by
// getCPUAlarmInput.
func getAlarmInputs(status *AutoScalingStatus, snsArn string) []*cloudwatch.PutMetricAlarmInput {
	inputs := []*cloudwatch.PutMetricAlarmInput{
		getDiscAlarmInput(status, snsArn),
		getLoadAverageAlarmInput(status, snsArn),
		getMemoryAlarmInput(status, snsArn),
	}
	inputs = append(inputs, getCPUAlarmInput(status, snsArn)...)
	return inputs
}
// getDiscAlarmInput builds the disk-usage alarm definition for one instance.
//
// The alarm is named "disk_used_percent-<AutoScalingGroupName>-<EC2InstanceId>"
// (the description is the same string). It watches the Average of the CWAgent
// metric disk_used_percent for the dimensions AutoScalingGroupName,
// InstanceId, device=tmpfs, fstype=tmpfs and path=/dev/shm, with Period 60,
// EvaluationPeriods 2 and a GreaterThanThreshold comparison against 80.
// snsArn is set as both the alarm action and the OK action.
func getDiscAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	name := fmt.Sprintf("disk_used_percent-%v-%v", status.AutoScalingGroupName, status.EC2InstanceId)

	return &cloudwatch.PutMetricAlarmInput{
		AlarmName:          aws.String(name),
		AlarmDescription:   aws.String(name),
		AlarmActions:       []*string{&snsArn},
		OKActions:          []*string{&snsArn},
		Namespace:          aws.String("CWAgent"),
		MetricName:         aws.String("disk_used_percent"),
		Statistic:          aws.String(cloudwatch.StatisticAverage),
		Period:             aws.Int64(60),
		EvaluationPeriods:  aws.Int64(2),
		ComparisonOperator: aws.String(cloudwatch.ComparisonOperatorGreaterThanThreshold),
		Threshold:          aws.Float64(80.0),
		Dimensions: []*cloudwatch.Dimension{
			{Name: aws.String("AutoScalingGroupName"), Value: &status.AutoScalingGroupName},
			{Name: aws.String("InstanceId"), Value: &status.EC2InstanceId},
			{Name: aws.String("device"), Value: aws.String("tmpfs")},
			{Name: aws.String("fstype"), Value: aws.String("tmpfs")},
			{Name: aws.String("path"), Value: aws.String("/dev/shm")},
		},
	}
}

// getMemoryAlarmInput builds the memory-usage alarm definition for one
// instance.
//
// The alarm is named "mem_used_percent-<AutoScalingGroupName>-<EC2InstanceId>"
// (the description is the same string). It watches the Average of the CWAgent
// metric mem_used_percent for the dimensions AutoScalingGroupName and
// InstanceId, with Period 60, EvaluationPeriods 2 and a GreaterThanThreshold
// comparison against 80. snsArn is set as both the alarm action and the OK
// action.
func getMemoryAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	name := fmt.Sprintf("mem_used_percent-%v-%v", status.AutoScalingGroupName, status.EC2InstanceId)

	dims := []*cloudwatch.Dimension{
		{Name: aws.String("AutoScalingGroupName"), Value: &status.AutoScalingGroupName},
		{Name: aws.String("InstanceId"), Value: &status.EC2InstanceId},
	}

	alarm := new(cloudwatch.PutMetricAlarmInput)
	alarm.AlarmName = aws.String(name)
	alarm.AlarmDescription = aws.String(name)
	alarm.AlarmActions = []*string{&snsArn}
	alarm.OKActions = []*string{&snsArn}
	alarm.Namespace = aws.String("CWAgent")
	alarm.MetricName = aws.String("mem_used_percent")
	alarm.Dimensions = dims
	alarm.Statistic = aws.String(cloudwatch.StatisticAverage)
	alarm.Period = aws.Int64(60)
	alarm.EvaluationPeriods = aws.Int64(2)
	alarm.ComparisonOperator = aws.String(cloudwatch.ComparisonOperatorGreaterThanThreshold)
	alarm.Threshold = aws.Float64(80.0)
	return alarm
}

// getCPUAlarmInput builds one CPU alarm definition per core listed below
// ("cpu0" and "cpu1").
//
// Each alarm is named
// "cpu_usage_system-<cpu>-<AutoScalingGroupName>-<EC2InstanceId>" (the
// description is the same string). It watches the Average of the CWAgent
// metric cpu_usage_system for the dimensions AutoScalingGroupName, InstanceId
// and cpu=<cpu>, with Period 60, EvaluationPeriods 2 and a
// GreaterThanThreshold comparison against 60. snsArn is set as both the alarm
// action and the OK action.
//
// The returned slice is never nil and holds the alarms in the same order as
// the core list.
func getCPUAlarmInput(status *AutoScalingStatus, snsArn string) []*cloudwatch.PutMetricAlarmInput {
	cpus := []string{"cpu0", "cpu1"}

	inputs := make([]*cloudwatch.PutMetricAlarmInput, 0, len(cpus))
	for _, cpu := range cpus {
		name := fmt.Sprintf("cpu_usage_system-%v-%v-%v", cpu, status.AutoScalingGroupName, status.EC2InstanceId)

		inputs = append(inputs, &cloudwatch.PutMetricAlarmInput{
			AlarmName:          aws.String(name),
			AlarmDescription:   aws.String(name),
			AlarmActions:       []*string{&snsArn},
			OKActions:          []*string{&snsArn},
			Namespace:          aws.String("CWAgent"),
			MetricName:         aws.String("cpu_usage_system"),
			Statistic:          aws.String(cloudwatch.StatisticAverage),
			Period:             aws.Int64(60),
			EvaluationPeriods:  aws.Int64(2),
			ComparisonOperator: aws.String(cloudwatch.ComparisonOperatorGreaterThanThreshold),
			Threshold:          aws.Float64(60.0),
			Dimensions: []*cloudwatch.Dimension{
				{Name: aws.String("AutoScalingGroupName"), Value: &status.AutoScalingGroupName},
				{Name: aws.String("InstanceId"), Value: &status.EC2InstanceId},
				// Copy the core name instead of taking &cpu: before Go 1.22
				// the range variable is shared by all iterations, so every
				// alarm would end up pointing at the last core.
				{Name: aws.String("cpu"), Value: aws.String(cpu)},
			},
		})
	}
	return inputs
}

// getLoadAverageAlarmInput builds the load-average alarm definition for one
// instance.
//
// The alarm is named "LoadAverage-<AutoScalingGroupName>-<EC2InstanceId>";
// its description is "LoadAverage-<EC2InstanceId>". It watches the Average of
// the metric "<EC2InstanceId>/LoadAverage" in the namespace
// "AmazonLinux/LoadAverage" for the single dimension InstanceId, with
// Period 60, EvaluationPeriods 2 and a GreaterThanThreshold comparison
// against 10. snsArn is set as both the alarm action and the OK action.
func getLoadAverageAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	instanceID := status.EC2InstanceId

	return &cloudwatch.PutMetricAlarmInput{
		AlarmName:          aws.String(fmt.Sprintf("LoadAverage-%v-%v", status.AutoScalingGroupName, instanceID)),
		AlarmDescription:   aws.String(fmt.Sprintf("LoadAverage-%v", instanceID)),
		AlarmActions:       []*string{&snsArn},
		OKActions:          []*string{&snsArn},
		Namespace:          aws.String("AmazonLinux/LoadAverage"),
		MetricName:         aws.String(fmt.Sprintf("%v/LoadAverage", instanceID)),
		Statistic:          aws.String(cloudwatch.StatisticAverage),
		Period:             aws.Int64(60),
		EvaluationPeriods:  aws.Int64(2),
		ComparisonOperator: aws.String(cloudwatch.ComparisonOperatorGreaterThanThreshold),
		Threshold:          aws.Float64(10.0),
		Dimensions: []*cloudwatch.Dimension{
			{Name: aws.String("InstanceId"), Value: &status.EC2InstanceId},
		},
	}
}



3.AutoScalingを既存のステータス監視機構に追加する


autoscaling.json

{

"source": [
"aws.autoscaling"
],
"detail-type": [
"EC2 Instance Launch Successful",
"EC2 Instance Terminate Successful"
]
}


参考文献