はじめに
こんにちは。Wano株式会社のエンジニアのnariと申します。
前回の記事で、EC2のメトリクスを要件を満たす形で収集できるようになりました。
しかし、このメトリクスに対してアラートを設定しようとすると、AutoScalingを設定しているインスタンスに関しては、スケールするたびにいちいちアラートを追加したり削除したりする必要があります。
それを手動でやるのは流石に現実的ではないため、前の記事「ApexとTerraformとGoでAWS上に構築したCD Pipelineのステータスをslackに通知 - Qiita」で作成したステータスチェックシステム(Apex,Terraform)とLambda(Go)を使って、インスタンスの増減に合わせてアラームを動的に設定することにしました。
システム全体像
どう設定したか
1.LambdaのRoleに以下のpolicyを設定する
policy.json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": "arn:aws:logs:*:*:*"
},
{
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"cloudwatch:PutMetricAlarm",
"cloudwatch:DeleteAlarms",
"cloudwatch:DescribeAlarms"
],
"Resource": "*"
}
]
}
2.Lambda(Go)の以下のスクリプトをデプロイする(Apex)
- 「AWS CloudWatch Agent導入+LoadAverage送信cronをTerraformとuserDataで設定して、デフォルトのEC2メトリクスを補填する - Qiita」で収集できるようにしたメトリクスのうち、CPU,Memory,Disk,LoadAverageに対してアラートをつけたい
- scaleStatusが"Launching"の時はアラームを付加し、"Terminating"の時はアラームを剥がす
main.go
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/aws/aws-lambda-go/events"
"github.com/aws/aws-lambda-go/lambda"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/cloudwatch"
"github.com/labstack/gommon/log"
"os"
"strings"
)
// AutoScalingStatus holds the fields that the handler decodes from the
// CloudWatch event's Detail payload via json.Unmarshal.
type AutoScalingStatus struct {
// AutoScalingGroupName is used in alarm names and as the value of the
// "AutoScalingGroupName" metric dimension.
AutoScalingGroupName string
// EC2InstanceId is used in alarm names and as the value of the
// "InstanceId" metric dimension.
EC2InstanceId string
// Description is split on spaces by the handler; its first word
// ("Launching" or "Terminating") selects whether alarms are created or deleted.
Description string
}
// main registers controlAutoScalingInstanceHandler as the Lambda handler.
func main() {
lambda.Start(controlAutoScalingInstanceHandler)
}
// controlAutoScalingInstanceHandler is the Lambda entry point for Auto Scaling
// CloudWatch events.
//
// It decodes event.Detail into an AutoScalingStatus and takes the first
// space-separated word of Description as the scaling status. On "Launching" it
// creates the per-instance alarms, on "Terminating" it deletes them, and any
// other status is logged and ignored.
//
// The SNS topic the alarms notify is read from the SNS_ARN environment
// variable. The returned error is the decode, session-creation, or CloudWatch
// API error, if any.
func controlAutoScalingInstanceHandler(ctx context.Context, event events.CloudWatchEvent) error {
	status := &AutoScalingStatus{}
	if err := json.Unmarshal(event.Detail, status); err != nil {
		log.Error(err)
		return err
	}

	// session.New is deprecated because it cannot report a failure at creation
	// time; NewSession surfaces configuration/credential errors immediately.
	sess, err := session.NewSession()
	if err != nil {
		log.Error(err)
		return err
	}
	cw := cloudwatch.New(sess, &aws.Config{Region: aws.String("ap-northeast-1")})
	snsArn := os.Getenv("SNS_ARN")

	// strings.Split always returns at least one element, so splits[0] is safe
	// even when Description is empty.
	splits := strings.Split(status.Description, " ")
	scaleStatus := splits[0]
	log.Infof("splits:%v", splits)

	switch scaleStatus {
	case "Launching":
		return createAlarm(status, cw, snsArn)
	case "Terminating":
		return deleteAlarm(status, cw, snsArn)
	default:
		log.Info("該当のstatusTypeではない")
		return nil
	}
}
// createAlarm registers, one by one, every alarm definition that
// getAlarmInputs builds for the instance described by status.
//
// It stops at the first PutMetricAlarm failure, logs it, and returns that
// error; alarms already registered before the failure are left in place.
func createAlarm(status *AutoScalingStatus, svc *cloudwatch.CloudWatch, snsArn string) error {
	alarmInputs := getAlarmInputs(status, snsArn)
	for i := 0; i < len(alarmInputs); i++ {
		out, putErr := svc.PutMetricAlarm(alarmInputs[i])
		if putErr == nil {
			log.Infof("putMetricAlarmOutput:%v", out)
			continue
		}
		log.Error(putErr)
		return putErr
	}
	return nil
}
// deleteAlarm removes, in a single DeleteAlarms call, every alarm that
// createAlarm registers for the instance described by status.
//
// The alarm names are taken from getAlarmInputs rather than re-spelled here,
// so the set of deleted alarms cannot drift from the set of created alarms.
// snsArn is only forwarded to getAlarmInputs; it does not affect the names.
//
// Returns the DeleteAlarms error, if any, after logging it.
func deleteAlarm(status *AutoScalingStatus, svc *cloudwatch.CloudWatch, snsArn string) error {
	inputs := getAlarmInputs(status, snsArn)
	alarmNames := make([]*string, 0, len(inputs))
	for _, alarmInput := range inputs {
		alarmNames = append(alarmNames, alarmInput.AlarmName)
	}

	deleteAlarmOutput, err := svc.DeleteAlarms(&cloudwatch.DeleteAlarmsInput{
		AlarmNames: alarmNames,
	})
	if err != nil {
		log.Error(err)
		return err
	}
	log.Infof("deleteAlarmOutput:%v", deleteAlarmOutput)
	return nil
}
// getAlarmInputs returns all alarm definitions for one instance, in this
// order: disk, load average, memory, then one entry per CPU from
// getCPUAlarmInput.
func getAlarmInputs(status *AutoScalingStatus, snsArn string) []*cloudwatch.PutMetricAlarmInput {
	disk := getDiscAlarmInput(status, snsArn)
	loadAverage := getLoadAverageAlarmInput(status, snsArn)
	memory := getMemoryAlarmInput(status, snsArn)

	all := []*cloudwatch.PutMetricAlarmInput{disk, loadAverage, memory}
	all = append(all, getCPUAlarmInput(status, snsArn)...)
	return all
}
// getDiscAlarmInput builds the alarm definition for the CWAgent metric
// "disk_used_percent" of one instance.
//
// The alarm is named "disk_used_percent-<group>-<instance>" (the description
// is the same text). It compares the Average statistic over a Period of 60
// against a threshold of 80 with GreaterThanThreshold, over 2 evaluation
// periods. snsArn is set as both the alarm action and the OK action.
//
// NOTE(review): the dimensions select device=tmpfs, fstype=tmpfs,
// path=/dev/shm — confirm this is the filesystem that should be watched.
func getDiscAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	// The SDK input takes pointers everywhere; these helpers return the
	// address of a private copy of their argument.
	strPtr := func(s string) *string { return &s }
	intPtr := func(n int64) *int64 { return &n }
	floatPtr := func(f float64) *float64 { return &f }

	label := fmt.Sprintf("disk_used_percent-%v-%v", status.AutoScalingGroupName, status.EC2InstanceId)

	dims := []*cloudwatch.Dimension{
		{Name: strPtr("AutoScalingGroupName"), Value: &status.AutoScalingGroupName},
		{Name: strPtr("InstanceId"), Value: &status.EC2InstanceId},
		{Name: strPtr("device"), Value: strPtr("tmpfs")},
		{Name: strPtr("fstype"), Value: strPtr("tmpfs")},
		{Name: strPtr("path"), Value: strPtr("/dev/shm")},
	}

	return &cloudwatch.PutMetricAlarmInput{
		AlarmName:          strPtr(label),
		AlarmDescription:   strPtr(label),
		AlarmActions:       []*string{&snsArn},
		OKActions:          []*string{&snsArn},
		Namespace:          strPtr("CWAgent"),
		MetricName:         strPtr("disk_used_percent"),
		Dimensions:         dims,
		Statistic:          strPtr(cloudwatch.StatisticAverage),
		Period:             intPtr(60),
		EvaluationPeriods:  intPtr(2),
		ComparisonOperator: strPtr(cloudwatch.ComparisonOperatorGreaterThanThreshold),
		Threshold:          floatPtr(80.0),
	}
}
// getMemoryAlarmInput builds the alarm definition for the CWAgent metric
// "mem_used_percent" of one instance.
//
// The alarm is named "mem_used_percent-<group>-<instance>" (the description
// is the same text) and is keyed by the AutoScalingGroupName and InstanceId
// dimensions. It compares the Average statistic over a Period of 60 against a
// threshold of 80 with GreaterThanThreshold, over 2 evaluation periods.
// snsArn is set as both the alarm action and the OK action.
func getMemoryAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	// Helpers returning the address of a private copy of their argument.
	strPtr := func(s string) *string { return &s }
	intPtr := func(n int64) *int64 { return &n }
	floatPtr := func(f float64) *float64 { return &f }

	label := fmt.Sprintf("mem_used_percent-%v-%v", status.AutoScalingGroupName, status.EC2InstanceId)

	result := &cloudwatch.PutMetricAlarmInput{}
	result.AlarmName = strPtr(label)
	result.AlarmDescription = strPtr(label)
	result.AlarmActions = []*string{&snsArn}
	result.OKActions = []*string{&snsArn}
	result.Namespace = strPtr("CWAgent")
	result.MetricName = strPtr("mem_used_percent")
	result.Statistic = strPtr(cloudwatch.StatisticAverage)
	result.Period = intPtr(60)
	result.EvaluationPeriods = intPtr(2)
	result.ComparisonOperator = strPtr(cloudwatch.ComparisonOperatorGreaterThanThreshold)
	result.Threshold = floatPtr(80.0)
	result.Dimensions = []*cloudwatch.Dimension{
		{Name: strPtr("AutoScalingGroupName"), Value: &status.AutoScalingGroupName},
		{Name: strPtr("InstanceId"), Value: &status.EC2InstanceId},
	}
	return result
}
// getCPUAlarmInput builds one alarm definition per CPU for the CWAgent metric
// "cpu_usage_system" of one instance.
//
// Each alarm is named "cpu_usage_system-<cpu>-<group>-<instance>" (the
// description is the same text) and is keyed by the AutoScalingGroupName,
// InstanceId and cpu dimensions. It compares the Average statistic over a
// Period of 60 against a threshold of 60 with GreaterThanThreshold, over 2
// evaluation periods. snsArn is set as both the alarm action and the OK action.
//
// NOTE(review): the CPU list is fixed to "cpu0" and "cpu1"; instance types
// with a different vCPU count are presumably not fully covered — confirm.
func getCPUAlarmInput(status *AutoScalingStatus, snsArn string) []*cloudwatch.PutMetricAlarmInput {
	var (
		dimensionNameAutoScalingGroupName string   = "AutoScalingGroupName"
		dimensionNameInstanceID           string   = "InstanceId"
		dimensionNameCPU                  string   = "cpu"
		dimensionValuesCPU                []string = []string{"cpu0", "cpu1"}
		evaluationPeriods                 int64    = 2
		threshold                         float64  = 60.0
		comparisonOperator                string   = cloudwatch.ComparisonOperatorGreaterThanThreshold
		metricName                        string   = "cpu_usage_system"
		namespace                         string   = "CWAgent"
		period                            int64    = 60
		statistic                         string   = cloudwatch.StatisticAverage
	)
	inputs := make([]*cloudwatch.PutMetricAlarmInput, 0, len(dimensionValuesCPU))
	for _, cpu := range dimensionValuesCPU {
		// Take a per-iteration copy before storing its address. Before Go 1.22
		// the range variable is shared by all iterations, so &cpu would make
		// every alarm's "cpu" dimension point at the last value ("cpu1").
		cpuValue := cpu
		dimension := []*cloudwatch.Dimension{
			{
				Name:  &dimensionNameAutoScalingGroupName,
				Value: &status.AutoScalingGroupName,
			},
			{
				Name:  &dimensionNameInstanceID,
				Value: &status.EC2InstanceId,
			},
			{
				Name:  &dimensionNameCPU,
				Value: &cpuValue,
			},
		}
		alarmName := fmt.Sprintf("cpu_usage_system-%v-%v-%v", cpuValue, status.AutoScalingGroupName, status.EC2InstanceId)
		alarmDescription := alarmName
		input := &cloudwatch.PutMetricAlarmInput{
			AlarmActions:       []*string{&snsArn},
			OKActions:          []*string{&snsArn},
			AlarmName:          &alarmName,
			ComparisonOperator: &comparisonOperator,
			EvaluationPeriods:  &evaluationPeriods,
			Threshold:          &threshold,
			Dimensions:         dimension,
			MetricName:         &metricName,
			Namespace:          &namespace,
			Period:             &period,
			Statistic:          &statistic,
			AlarmDescription:   &alarmDescription,
		}
		inputs = append(inputs, input)
	}
	return inputs
}
// getLoadAverageAlarmInput builds the alarm definition for the custom
// load-average metric of one instance.
//
// The metric is "<instance>/LoadAverage" in namespace
// "AmazonLinux/LoadAverage", keyed only by the InstanceId dimension. The alarm
// is named "LoadAverage-<group>-<instance>", while its description is
// "LoadAverage-<instance>" (without the group name). It compares the Average
// statistic over a Period of 60 against a threshold of 10 with
// GreaterThanThreshold, over 2 evaluation periods. snsArn is set as both the
// alarm action and the OK action.
func getLoadAverageAlarmInput(status *AutoScalingStatus, snsArn string) *cloudwatch.PutMetricAlarmInput {
	// Helpers returning the address of a private copy of their argument.
	strPtr := func(s string) *string { return &s }
	intPtr := func(n int64) *int64 { return &n }
	floatPtr := func(f float64) *float64 { return &f }

	instanceOnly := []*cloudwatch.Dimension{
		{Name: strPtr("InstanceId"), Value: &status.EC2InstanceId},
	}

	return &cloudwatch.PutMetricAlarmInput{
		AlarmName:          strPtr(fmt.Sprintf("LoadAverage-%v-%v", status.AutoScalingGroupName, status.EC2InstanceId)),
		AlarmDescription:   strPtr(fmt.Sprintf("LoadAverage-%v", status.EC2InstanceId)),
		AlarmActions:       []*string{&snsArn},
		OKActions:          []*string{&snsArn},
		Namespace:          strPtr("AmazonLinux/LoadAverage"),
		MetricName:         strPtr(fmt.Sprintf("%v/LoadAverage", status.EC2InstanceId)),
		Dimensions:         instanceOnly,
		Statistic:          strPtr(cloudwatch.StatisticAverage),
		Period:             intPtr(60),
		EvaluationPeriods:  intPtr(2),
		ComparisonOperator: strPtr(cloudwatch.ComparisonOperatorGreaterThanThreshold),
		Threshold:          floatPtr(10.0),
	}
}
3.AutoScalingを既存のステータス監視機構に追加する
- ApexとTerraformとGoでAWS上に構築したCD Pipelineのステータスをslackに通知 - Qiitaで作成した、リソースのステータス監視にAutoScalingも追加する(以下のeventに対して)
autoscaling.json
{
"source": [
"aws.autoscaling"
],
"detail-type": [
"EC2 Instance Launch Successful",
"EC2 Instance Terminate Successful"
]
}