AWS
Terraform

TerraformでAWSのスポットインスタンスを作成する


概要

最近仕事でDeepLearningをしていまして、

AWSでp系でスポットインスタンスを立ててちょっと学習させる、みたい事が増えてきました。

毎回コンソールでポチポチするのが手間だったのでTerraformでできるようにしました。


コード(2018/8/10修正)

※IAMロールの作成部分を2018/8/10に追加しました

公式ドキュメントを見ながら実装しました。

aws_spot_instance_requestを使おうとしてハマったのですが(自分が作ったVPCの指定ができなかった)、結局aws_spot_fleet_requestで台数を1台で指定することで解決しました。

あとポイントとしては、特にp系は1AZだとリソースが取れないことがあるので、複数AZを指定するようにしています。

ここにあげてます。(デフォルトでp2を使うようになっているので料金お気をつけください。)

こんな感じです。


main.tf

provider "aws" {

access_key = "${var.aws_access_key}"
secret_key = "${var.aws_secret_key}"
region = "${var.region}"
}

# IAMロール
data "aws_iam_policy_document" "assume_role" {
statement {
actions = ["sts:AssumeRole"]

principals {
type = "Service"
identifiers = ["spotfleet.amazonaws.com"]
}
}
}

resource "aws_iam_role" "spot-fleet-role" {
name = "ml-role"
assume_role_policy = "${data.aws_iam_policy_document.assume_role.json}"
}

resource "aws_iam_policy_attachment" "policy-attach" {
name = "ml-role-policy"
roles = ["${aws_iam_role.spot-fleet-role.id}"]
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"
}

# VPC
resource "aws_vpc" "ml-vpc" {
cidr_block = "10.1.0.0/16"
instance_tenancy = "default"
enable_dns_support = "true"
enable_dns_hostnames = "true"

tags {
Name = "ml-vpc"
}
}

# Internet Gateway
resource "aws_internet_gateway" "ml-igw" {
vpc_id = "${aws_vpc.ml-vpc.id}"

tags {
Name = "ml-igw"
}
}

# Subnet
resource "aws_subnet" "ml-subnet-public" {
count = "${length(var.availability_zones)}"
vpc_id = "${aws_vpc.ml-vpc.id}"
cidr_block = "${format("10.1.%d.0/24", count.index + 1)}"
availability_zone = "${lookup(var.availability_zones, count.index)}"
map_public_ip_on_launch = "true"

tags {
Name = "${format("ml-subnet-public-%d", count.index + 1)}"
}
}

# Route Table
resource "aws_route_table" "ml-route-public" {
vpc_id = "${aws_vpc.ml-vpc.id}"

route {
cidr_block = "0.0.0.0/0"
gateway_id = "${aws_internet_gateway.ml-igw.id}"
}

tags {
Name = "ml-route-table-public"
}
}

resource "aws_route_table_association" "ml-assoc" {
count = "${length(var.availability_zones)}"
subnet_id = "${element(aws_subnet.ml-subnet-public.*.id, count.index)}"
route_table_id = "${aws_route_table.ml-route-public.id}"
}

# Security Group
### Web
resource "aws_security_group" "ml-web-sg" {
name = "ml-web-sg"
description = "Allow SSH inbound traffic"
vpc_id = "${aws_vpc.ml-vpc.id}"

ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["${var.my_ip_address}"]
}

egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}

tags {
Name = "ml-web-sg"
}
}

# Key Pair
resource "aws_key_pair" "ml-key" {
key_name = "ml-key"
public_key = "${var.aws_public_key}"
}

data "aws_caller_identity" "current" {}

# Spot Fleet Request
resource "aws_spot_fleet_request" "ml-spot-request" {
iam_fleet_role = "${aws_iam_role.spot-fleet-role.arn}"

# spot_price = "0.1290" # Max Price デフォルトはOn-demand Price
target_capacity = "${var.spot_target_capacity}"
terminate_instances_with_expiration = true
wait_for_fulfillment = "true" # fulfillするまでTerraformが待つ

launch_specification {
ami = "${var.spot_instance_ami}"
instance_type = "${var.spot_instance_type}"
key_name = "${aws_key_pair.ml-key.key_name}"
vpc_security_group_ids = ["${aws_security_group.ml-web-sg.id}"]
subnet_id = "${element(aws_subnet.ml-subnet-public.*.id, 0)}"
associate_public_ip_address = true

root_block_device {
volume_size = "${var.gp2_volume_size}"
volume_type = "gp2"
}

tags {
Name = "ml-instance"
}
}

launch_specification {
ami = "${var.spot_instance_ami}"
instance_type = "${var.spot_instance_type}"
key_name = "${aws_key_pair.ml-key.key_name}"
vpc_security_group_ids = ["${aws_security_group.ml-web-sg.id}"]
subnet_id = "${element(aws_subnet.ml-subnet-public.*.id, 1)}"
associate_public_ip_address = true

root_block_device {
volume_size = "${var.gp2_volume_size}"
volume_type = "gp2"
}

tags {
Name = "ml-instance"
}
}

launch_specification {
ami = "${var.spot_instance_ami}"
instance_type = "${var.spot_instance_type}"
key_name = "${aws_key_pair.ml-key.key_name}"
vpc_security_group_ids = ["${aws_security_group.ml-web-sg.id}"]
subnet_id = "${element(aws_subnet.ml-subnet-public.*.id, 2)}"
associate_public_ip_address = true

root_block_device {
volume_size = "${var.gp2_volume_size}"
volume_type = "gp2"
}

tags {
Name = "ml-instance"
}
}
}

data "aws_instance" "ml-instance" {
filter {
name = "tag:Name"
values = ["ml-instance"]
}

depends_on = ["aws_spot_fleet_request.ml-spot-request"]
}

output "ip" {
value = "${data.aws_instance.ml-instance.public_ip}"
depends_on = ["aws_spot_fleet_request.ml-spot-request"]
}



使い方

いつも通りです。


使う時

terraform plan

terraform apply


消す時

terraform plan --destroy

trraform destroy


その他

スポットリクエスト以外のリソースを毎回消して作るのも非効率だなと思って、

自分は以下のような感じで使ってます。

terraform apply --target=aws_spot_fleet_request.ml-spot-request

terraform destroy --target=aws_spot_fleet_request.ml-spot-request


今後の予定

次はEC2でS3をマウントさせる部分を自動化しようと思っています。