FargateSpotを活用したTerraform構成 ~ガンガン型編~

Last updated at 2023-06-29 / Posted at 2023-06-29

(もしまだ前回の記事をご覧になっていなければ、先にそちらから見ていただくと背景等が分かりやすいかと思います)

背景と記事概要

前回の記事では参考元記事のバランス型を活用して実際にTerraformのコードを記載してみました。
今回はより積極的にコストカットを行えるガンガン型をTerraform化していこうと思います。

成果物イメージ

ガンガン型はECS Service,CloudWatch Alarm,AutoScalingをそれぞれFargate用,FargateSpot用で用意することにより実現するため、作成するリソース数は多くなります。
バリエーションを出すため、前回の記事と違い監視項目はALBの平均リクエスト数(RequestCountPerTarget)としています。

ひとまずコード

何はともあれ全体のソース。

VPCやらSecurityGroup,ECS Clusterやらまで書くと長くなるので割愛してます。
記載していないリソースが出てきたら、裏でそれを定義しているものだと読み替えてください。

# Shared name prefix that is interpolated into the resource names below.
locals {
  default-name = "test"
}

#
# ECS
#
# Service that runs only on regular (non-Spot) Fargate.
resource "aws_ecs_service" "gangan-not-fargate-spot" {
  name            = "${local.default-name}-gangan-not-fargate-spot"
  cluster         = aws_ecs_cluster.fargate-spot.id
  task_definition = aws_ecs_task_definition.task.arn

  # Initial task count only. After creation the count is adjusted by the
  # Application Auto Scaling policies attached to this service.
  desired_count    = 1
  launch_type      = "FARGATE"
  platform_version = "1.4.0"

  network_configuration {
    assign_public_ip = true
    subnets          = [aws_subnet.public-1a.id, aws_subnet.public-1c.id]
    security_groups  = [aws_security_group.ecs.id]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.gangan.arn
    container_name   = "test" # depends on the container definition
    container_port   = 8080
  }

  # The task count is owned by auto scaling. Without this, every
  # `terraform apply` would reset the service back to desired_count and
  # undo whatever scale-out/scale-in had happened in the meantime.
  lifecycle {
    ignore_changes = [desired_count]
  }
}
# Service that runs only on Fargate Spot.
resource "aws_ecs_service" "gangan-fargate-spot" {
  name            = "${local.default-name}-gangan-fargate-spot"
  cluster         = aws_ecs_cluster.fargate-spot.id
  task_definition = aws_ecs_task_definition.task.arn

  # Initial task count only. After creation the count is adjusted by the
  # Application Auto Scaling policies attached to this service.
  desired_count    = 1
  platform_version = "1.4.0"

  network_configuration {
    assign_public_ip = true
    subnets          = [aws_subnet.public-1a.id, aws_subnet.public-1c.id]
    security_groups  = [aws_security_group.ecs.id]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.gangan.arn
    container_name   = "test" # depends on the container definition
    container_port   = 8080
  }

  # No launch_type here: placement is decided by the capacity provider
  # strategy below. FARGATE_SPOT carries all of the base and weight.
  capacity_provider_strategy {
    capacity_provider = "FARGATE_SPOT"
    base              = 1
    weight            = 1
  }

  # FARGATE is listed with base = 0 and weight = 0 so that tasks always try
  # to start on Fargate Spot.
  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 0
    weight            = 0
  }

  # The task count is owned by auto scaling. Without this, every
  # `terraform apply` would reset the service back to desired_count and
  # undo whatever scale-out/scale-in had happened in the meantime.
  lifecycle {
    ignore_changes = [desired_count]
  }
}

#
# FargateSpot側のオートスケーリング設定
#
# Looks up the existing IAM role named
# AWSServiceRoleForApplicationAutoScaling_ECSService so that its ARN can be
# passed as role_arn to the aws_appautoscaling_target resources.
data "aws_iam_role" "ecs-service-autoscaling" {
  name = "AWSServiceRoleForApplicationAutoScaling_ECSService"
}
# Registers the FargateSpot service's DesiredCount as a scalable target,
# bounded to 1..3 tasks.
resource "aws_appautoscaling_target" "gangan-fargate-spot" {
  min_capacity = 1
  max_capacity = 3

  role_arn = data.aws_iam_role.ecs-service-autoscaling.arn

  # Identifies what is being scaled: the ECS service's desired task count.
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
  resource_id        = "service/${aws_ecs_cluster.fargate-spot.name}/${aws_ecs_service.gangan-fargate-spot.name}"
}
# Step scaling policy that adds one task to the FargateSpot service.
# It is invoked by the CloudWatch alarm that lists this policy's ARN in
# alarm_actions.
resource "aws_appautoscaling_policy" "gangan-fargate-spot-scale-up" {
  name = "${local.default-name}-scale-up"

  # Read the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the target before it creates this policy; otherwise the policy
  # can be created first and fail because no scalable target exists yet.
  service_namespace  = aws_appautoscaling_target.gangan-fargate-spot.service_namespace
  resource_id        = aws_appautoscaling_target.gangan-fargate-spot.resource_id
  scalable_dimension = aws_appautoscaling_target.gangan-fargate-spot.scalable_dimension

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 30
    metric_aggregation_type = "Average"

    # Any breach at or above the alarm threshold (lower bound 0, no upper
    # bound) adds exactly one task.
    step_adjustment {
      metric_interval_lower_bound = 0
      scaling_adjustment          = 1
    }
  }
}
# Step scaling policy that removes one task from the FargateSpot service.
# It is invoked by the CloudWatch alarm that lists this policy's ARN in
# alarm_actions.
resource "aws_appautoscaling_policy" "gangan-fargate-spot-scale-down" {
  name = "${local.default-name}-scale-down"

  # Read the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the target before it creates this policy; otherwise the policy
  # can be created first and fail because no scalable target exists yet.
  service_namespace  = aws_appautoscaling_target.gangan-fargate-spot.service_namespace
  resource_id        = aws_appautoscaling_target.gangan-fargate-spot.resource_id
  scalable_dimension = aws_appautoscaling_target.gangan-fargate-spot.scalable_dimension

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 30
    metric_aggregation_type = "Average"

    # Any breach at or below the alarm threshold (upper bound 0, no lower
    # bound) removes exactly one task.
    step_adjustment {
      metric_interval_upper_bound = 0
      scaling_adjustment          = -1
    }
  }
}
# Scale-out alarm for the FargateSpot service, driven by the ALB target
# group's RequestCountPerTarget metric.
resource "aws_cloudwatch_metric_alarm" "gangan-fargate-spot-request-high" {
  alarm_name    = "${local.default-name}-alarm-fargate-spot-request-high"
  alarm_actions = [aws_appautoscaling_policy.gangan-fargate-spot-scale-up.arn]

  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  dimensions = {
    TargetGroup = aws_lb_target_group.gangan.arn_suffix
  }

  # Go into ALARM as soon as one 60-second datapoint of
  # (request count / container count) reaches the threshold.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "5"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}
# Scale-in alarm for the FargateSpot service, driven by the ALB target
# group's RequestCountPerTarget metric.
resource "aws_cloudwatch_metric_alarm" "gangan-fargate-spot-request-low" {
  alarm_name    = "${local.default-name}-alarm-fargate-spot-request-low"
  alarm_actions = [aws_appautoscaling_policy.gangan-fargate-spot-scale-down.arn]

  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  dimensions = {
    TargetGroup = aws_lb_target_group.gangan.arn_suffix
  }

  # Go into ALARM as soon as one 60-second datapoint of
  # (request count / container count) is at or below the threshold.
  # NOTE(review): the request-high alarm for this service uses an inclusive
  # comparison against the same threshold, so at exactly 5 both alarms are in
  # ALARM at the same time.
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = "5"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}



#
# Fargate側のオートスケーリング設定
#
# Registers the Fargate-only service's DesiredCount as a scalable target,
# bounded to 1..3 tasks.
resource "aws_appautoscaling_target" "gangan-not-fargate-spot" {
  min_capacity = 1
  max_capacity = 3

  role_arn = data.aws_iam_role.ecs-service-autoscaling.arn

  # Identifies what is being scaled: the ECS service's desired task count.
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
  resource_id        = "service/${aws_ecs_cluster.fargate-spot.name}/${aws_ecs_service.gangan-not-fargate-spot.name}"
}
# Step scaling policy that adds one task to the Fargate-only service.
# It is invoked by the CloudWatch alarm that lists this policy's ARN in
# alarm_actions.
resource "aws_appautoscaling_policy" "gangan-not-fargate-spot-scale-up" {
  name = "${local.default-name}-scale-up"

  # Read the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the target before it creates this policy; otherwise the policy
  # can be created first and fail because no scalable target exists yet.
  service_namespace  = aws_appautoscaling_target.gangan-not-fargate-spot.service_namespace
  resource_id        = aws_appautoscaling_target.gangan-not-fargate-spot.resource_id
  scalable_dimension = aws_appautoscaling_target.gangan-not-fargate-spot.scalable_dimension

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 30
    metric_aggregation_type = "Average"

    # Any breach at or above the alarm threshold (lower bound 0, no upper
    # bound) adds exactly one task.
    step_adjustment {
      metric_interval_lower_bound = 0
      scaling_adjustment          = 1
    }
  }
}
# Step scaling policy that removes one task from the Fargate-only service.
# It is invoked by the CloudWatch alarm that lists this policy's ARN in
# alarm_actions.
resource "aws_appautoscaling_policy" "gangan-not-fargate-spot-scale-down" {
  name = "${local.default-name}-scale-down"

  # Read the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the target before it creates this policy; otherwise the policy
  # can be created first and fail because no scalable target exists yet.
  service_namespace  = aws_appautoscaling_target.gangan-not-fargate-spot.service_namespace
  resource_id        = aws_appautoscaling_target.gangan-not-fargate-spot.resource_id
  scalable_dimension = aws_appautoscaling_target.gangan-not-fargate-spot.scalable_dimension

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 30
    metric_aggregation_type = "Average"

    # Any breach at or below the alarm threshold (upper bound 0, no lower
    # bound) removes exactly one task.
    step_adjustment {
      metric_interval_upper_bound = 0
      scaling_adjustment          = -1
    }
  }
}
# metrics
# Scale-out alarm for the Fargate-only service, driven by the ALB target
# group's RequestCountPerTarget metric.
resource "aws_cloudwatch_metric_alarm" "gangan-not-fargate-spot-request-high" {
  alarm_name    = "${local.default-name}-alarm-not-fargate-spot-request-high"
  alarm_actions = [aws_appautoscaling_policy.gangan-not-fargate-spot-scale-up.arn]

  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  dimensions = {
    TargetGroup = aws_lb_target_group.gangan.arn_suffix
  }

  # Go into ALARM as soon as one 60-second datapoint reaches the threshold.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "20"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}
# Scale-in alarm for the Fargate-only service, driven by the ALB target
# group's RequestCountPerTarget metric.
resource "aws_cloudwatch_metric_alarm" "gangan-not-fargate-spot-request-low" {
  alarm_name    = "${local.default-name}-alarm-not-fargate-spot-request-low"
  alarm_actions = [aws_appautoscaling_policy.gangan-not-fargate-spot-scale-down.arn]

  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  dimensions = {
    TargetGroup = aws_lb_target_group.gangan.arn_suffix
  }

  # Go into ALARM as soon as one 60-second datapoint is at or below the
  # threshold.
  # NOTE(review): the request-high alarm for this service uses an inclusive
  # comparison against the same threshold, so at exactly 20 both alarms are
  # in ALARM at the same time.
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = "20"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}

解説

Serviceの作成

ガンガン型の要となるサービスの作成部分です。

# Service that runs only on regular (non-Spot) Fargate.
resource "aws_ecs_service" "gangan-not-fargate-spot" {
  # (omitted)
}
# Service that runs only on Fargate Spot.
resource "aws_ecs_service" "gangan-fargate-spot" {
  # (omitted)
  # FARGATE_SPOT carries all of the base and weight.
  capacity_provider_strategy {
    capacity_provider = "FARGATE_SPOT"
    base              = 1
    weight            = 1
  }
  # FARGATE is listed with base = 0 and weight = 0.
  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 0
    weight            = 0
  }
}

デフォルトの設定ではサービスにはキャパシティープロバイダー戦略は紐づいておらず全てFargateで起動します。
(aws_ecs_service.gangan-not-fargate-spotがよくある通常のパターンですね)

FargateSpotのみを起動するサービスを今回は用意したいのでaws_ecs_service.gangan-fargate-spotで定義してます。
capacity_provider = "FARGATE"の設定値をbase=0,weight=0とすることで常にFargateSpotでコンテナが立ちあがろうとします。

アラーム設定(Fargate,FargateSpot)

オートスケーリングにてスケールアウト・インするために必要なアラーム設定です。

# (一部省略して記載してます)

# FargateSpot: scale-out alarm (threshold 5)
resource "aws_cloudwatch_metric_alarm" "gangan-fargate-spot-request-high" {
  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  # One datapoint at or above the threshold puts the alarm into ALARM.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "5"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}
# FargateSpot: scale-in alarm (threshold 5)
resource "aws_cloudwatch_metric_alarm" "gangan-fargate-spot-request-low" {
  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  # One datapoint at or below the threshold puts the alarm into ALARM.
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = "5"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}

# Fargate: scale-out alarm (threshold 20)
resource "aws_cloudwatch_metric_alarm" "gangan-not-fargate-spot-request-high" {
  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  # One datapoint at or above the threshold puts the alarm into ALARM.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = "20"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}
# Fargate: scale-in alarm (threshold 20)
resource "aws_cloudwatch_metric_alarm" "gangan-not-fargate-spot-request-low" {
  # Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "RequestCountPerTarget"
  statistic   = "Sum"
  period      = "60"

  # One datapoint at or below the threshold puts the alarm into ALARM.
  comparison_operator = "LessThanOrEqualToThreshold"
  threshold           = "20"
  evaluation_periods  = "1"
  datapoints_to_alarm = "1"
}

今回はリクエスト数の平均(リクエスト数 / コンテナ数)を利用してオートスケーリングするかどうかを判断させています。
ガンガン型ではFargateSpotをより積極的に活用していきたいため、FargateSpot側はFargateよりも早いタイミング(低い負荷)でスケールアウトし、逆にスケールインはFargateよりも遅いタイミング(より負荷が下がってから)で実施される必要があります。
したがって下記の関係が成り立つように各アラームの設定を行っています。
Fargateのthreshold > FargateSpotのthreshold

例えば毎分30リクエストあって既にコンテナが各サービスで1台ずつ立っている場合、以下のような形でスケールアウトされます。

アラーム状態か判断 → 1コンテナあたりのリクエスト数は 30 / 2 = 15 で、FargateSpotの閾値(5)以上・Fargateの閾値(20)未満のため、スケールアウト用(request-high)のアラームはFargateSpot側のみアラーム状態になる
FargateSpot側のサービスにコンテナが追加される

詳細については参考元記事をご覧ください。

実際に適用しようとした時の注意点と使い所

バランス型、ガンガン型それぞれを実際にプロトタイプ的に作り、どういう特性があるのかを確認してみました。
実運用でこのバランス型、ガンガン型の反映を試みたのですが、その点についてもう少しだけ記載しようかと思います。

共通点かつ注意点

すでに存在しているサービスがあり、そこにcapacity_providerを適用しようとすると必ずリソースが再作成されます。
単純にcapacity_providerを設定したいだけなのになぁという不満点は残りますが、仕様上そうなってるため仕方ないようです。
(厳密には必ずじゃないけど)

(Optional) Capacity provider strategies to use for the service. Can be one or more. These can be updated without destroying and recreating the service only if force_new_deployment = true and not changing from 0 capacity_provider_strategy blocks to greater than 0, or vice versa. See below.

そのため絶対に止まってはいけないシステム(APIサーバーとか)でこのcapacity_providerを適用したい場合は以下のような流れをとる必要があります。

キャパシティープロバイダー戦略を適用した新しいECS Serviceを作成する
ALB等のターゲットに↑で作成されたコンテナを適用する
古いECS Serviceを削除する

バランス型の使い所

基本何でも。
定期的にスパイクする可能性のあるWebサーバーやらAPIサーバーやらに使っていただくと良きかと。
キャパシティープロバイダー戦略の設定によっていかようにでもできます。

ガンガン型の使い所

バランス型に比べてリソースが確保できない可能性が残るため、なくても最悪なんとかなるけど高速化を見込みたいみたいな時に使えそうです。
例えばSQSをポーリングして溜まっているメッセージを捌くようなシステムの場合、最悪FargateSpotが確保できなくても処理上は問題ないと判断できそうです。
(とはいえ運用でFargateSpotいくつか使ってますが、そもそも確保できないパターンが現状そんなに起きないです)

参考元

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up