概要
最近仕事でDeepLearningをしていまして、
AWSでp系のスポットインスタンスを立ててちょっと学習させる、みたいな事が増えてきました。
毎回コンソールでポチポチするのが手間だったのでTerraformでできるようにしました。
コード(2018/8/10修正)
※IAMロールの作成部分を2018/8/10に追加しました
公式ドキュメントを見ながら実装しました。
aws_spot_instance_request
を使おうとしてハマったのですが(自分が作ったVPCの指定ができなかった)、結局aws_spot_fleet_request
で台数を1台で指定することで解決しました。
あとポイントとしては、特にp系は1AZだとリソースが取れないことがあるので、複数AZを指定するようにしています。
ここにあげてます。(デフォルトでp2を使うようになっているので料金お気をつけください。)
こんな感じです。
main.tf
# AWS provider configuration. The region and both credential values are
# taken from input variables rather than being hard-coded here.
provider "aws" {
  region     = "${var.region}"
  access_key = "${var.aws_access_key}"
  secret_key = "${var.aws_secret_key}"
}
# IAM role
# Trust policy document: allows the spotfleet.amazonaws.com service principal
# to perform sts:AssumeRole. Rendered to JSON and consumed by the role below.
data "aws_iam_policy_document" "assume_role" {
  statement {
    principals {
      identifiers = ["spotfleet.amazonaws.com"]
      type        = "Service"
    }

    actions = ["sts:AssumeRole"]
  }
}
# Role named "ml-role"; its trust policy is the JSON rendered from the
# assume_role policy document. Passed to the Spot Fleet request as
# iam_fleet_role.
resource "aws_iam_role" "spot-fleet-role" {
  assume_role_policy = "${data.aws_iam_policy_document.assume_role.json}"
  name               = "ml-role"
}
# Attaches the AWS-managed AmazonEC2SpotFleetTaggingRole policy to the
# spot-fleet role.
# NOTE(review): the provider documentation describes
# aws_iam_policy_attachment as an exclusive attachment of the policy across
# the whole account; in a shared account aws_iam_role_policy_attachment may
# be the safer choice - confirm before reusing this elsewhere.
resource "aws_iam_policy_attachment" "policy-attach" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole"
  roles      = ["${aws_iam_role.spot-fleet-role.id}"]
  name       = "ml-role-policy"
}
# VPC
# Dedicated 10.1.0.0/16 network with default tenancy; DNS support and DNS
# hostnames are both switched on.
resource "aws_vpc" "ml-vpc" {
  tags {
    Name = "ml-vpc"
  }

  cidr_block       = "10.1.0.0/16"
  instance_tenancy = "default"

  enable_dns_hostnames = "true"
  enable_dns_support   = "true"
}
# Internet Gateway
# Attached to ml-vpc; the public route table sends 0.0.0.0/0 through it.
resource "aws_internet_gateway" "ml-igw" {
  tags {
    Name = "ml-igw"
  }

  vpc_id = "${aws_vpc.ml-vpc.id}"
}
# Subnet
# One public subnet per entry in var.availability_zones. Subnet N (1-based)
# gets the CIDR 10.1.N.0/24 and the Name tag ml-subnet-public-N. Instances
# launched here receive a public IP automatically.
resource "aws_subnet" "ml-subnet-public" {
  count = "${length(var.availability_zones)}"

  tags {
    Name = "${format("ml-subnet-public-%d", count.index + 1)}"
  }

  vpc_id                  = "${aws_vpc.ml-vpc.id}"
  availability_zone       = "${lookup(var.availability_zones, count.index)}"
  cidr_block              = "${format("10.1.%d.0/24", count.index + 1)}"
  map_public_ip_on_launch = "true"
}
# Route Table
# Public route table for ml-vpc: a single default route (0.0.0.0/0) that
# points at the internet gateway.
resource "aws_route_table" "ml-route-public" {
  tags {
    Name = "ml-route-table-public"
  }

  vpc_id = "${aws_vpc.ml-vpc.id}"

  route {
    gateway_id = "${aws_internet_gateway.ml-igw.id}"
    cidr_block = "0.0.0.0/0"
  }
}
# Associates every public subnet (one per availability zone) with the public
# route table.
resource "aws_route_table_association" "ml-assoc" {
  count = "${length(var.availability_zones)}"

  route_table_id = "${aws_route_table.ml-route-public.id}"
  subnet_id      = "${element(aws_subnet.ml-subnet-public.*.id, count.index)}"
}
# Security Group
### Web
# Inbound: TCP port 22 (SSH) only, and only from var.my_ip_address.
# Outbound: all protocols and ports to any destination.
resource "aws_security_group" "ml-web-sg" {
  vpc_id      = "${aws_vpc.ml-vpc.id}"
  name        = "ml-web-sg"
  description = "Allow SSH inbound traffic"

  tags {
    Name = "ml-web-sg"
  }

  ingress {
    protocol    = "tcp"
    from_port   = 22
    to_port     = 22
    cidr_blocks = ["${var.my_ip_address}"]
  }

  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}
# Key Pair
# Registers the public key from var.aws_public_key under the name "ml-key";
# the Spot Fleet launch specifications reference it by key_name.
resource "aws_key_pair" "ml-key" {
  public_key = "${var.aws_public_key}"
  key_name   = "ml-key"
}
# Identity of the credentials Terraform is running with.
# NOTE(review): nothing in this listing references
# data.aws_caller_identity.current - verify whether it is still needed.
data "aws_caller_identity" "current" {}
# Spot Fleet Request
# Requests var.spot_target_capacity spot instances through a Spot Fleet.
# Three launch specifications are declared; they are identical except for the
# subnet (public subnets at index 0, 1 and 2), so the fleet can be fulfilled
# from whichever availability zone currently has capacity.
# NOTE(review): assumes var.availability_zones has at least three entries so
# that each index maps to a distinct subnet - TODO confirm.
resource "aws_spot_fleet_request" "ml-spot-request" {
  # The fleet role is only usable once its policy is attached. Without this
  # explicit dependency Terraform may create the request before the
  # attachment exists, and a targeted apply of this resource alone
  # (-target=aws_spot_fleet_request.ml-spot-request) would skip the
  # attachment entirely.
  depends_on = ["aws_iam_policy_attachment.policy-attach"]

  iam_fleet_role = "${aws_iam_role.spot-fleet-role.arn}"

  # spot_price = "0.1290" # Max price; defaults to the On-Demand price
  target_capacity                     = "${var.spot_target_capacity}"
  terminate_instances_with_expiration = true
  wait_for_fulfillment                = "true" # Terraform waits until the request is fulfilled

  # Launch specification for the first public subnet (index 0).
  launch_specification {
    ami                         = "${var.spot_instance_ami}"
    instance_type               = "${var.spot_instance_type}"
    key_name                    = "${aws_key_pair.ml-key.key_name}"
    vpc_security_group_ids      = ["${aws_security_group.ml-web-sg.id}"]
    subnet_id                   = "${element(aws_subnet.ml-subnet-public.*.id, 0)}"
    associate_public_ip_address = true

    root_block_device {
      volume_size = "${var.gp2_volume_size}"
      volume_type = "gp2"
    }

    # The aws_instance data source locates the launched instance by this tag.
    tags {
      Name = "ml-instance"
    }
  }

  # Launch specification for the second public subnet (index 1).
  launch_specification {
    ami                         = "${var.spot_instance_ami}"
    instance_type               = "${var.spot_instance_type}"
    key_name                    = "${aws_key_pair.ml-key.key_name}"
    vpc_security_group_ids      = ["${aws_security_group.ml-web-sg.id}"]
    subnet_id                   = "${element(aws_subnet.ml-subnet-public.*.id, 1)}"
    associate_public_ip_address = true

    root_block_device {
      volume_size = "${var.gp2_volume_size}"
      volume_type = "gp2"
    }

    tags {
      Name = "ml-instance"
    }
  }

  # Launch specification for the third public subnet (index 2).
  launch_specification {
    ami                         = "${var.spot_instance_ami}"
    instance_type               = "${var.spot_instance_type}"
    key_name                    = "${aws_key_pair.ml-key.key_name}"
    vpc_security_group_ids      = ["${aws_security_group.ml-web-sg.id}"]
    subnet_id                   = "${element(aws_subnet.ml-subnet-public.*.id, 2)}"
    associate_public_ip_address = true

    root_block_device {
      volume_size = "${var.gp2_volume_size}"
      volume_type = "gp2"
    }

    tags {
      Name = "ml-instance"
    }
  }
}
# Looks up the instance launched by the Spot Fleet via its Name tag so that
# its public IP can be exported. Evaluated only after the fleet request.
# NOTE(review): this data source expects exactly one match, so it presumably
# only works while var.spot_target_capacity is 1 - confirm.
data "aws_instance" "ml-instance" {
  filter {
    name   = "tag:Name"
    values = ["ml-instance"]
  }

  # Restrict the match to live instances. After the fleet is destroyed and
  # immediately re-created, the previous instance can still be shutting down
  # while carrying the same Name tag, which would otherwise make the lookup
  # ambiguous.
  filter {
    name   = "instance-state-name"
    values = ["pending", "running"]
  }

  depends_on = ["aws_spot_fleet_request.ml-spot-request"]
}
# Public IP address of the launched spot instance (for SSH access).
# Resolved only after the Spot Fleet request has been applied.
output "ip" {
  depends_on = ["aws_spot_fleet_request.ml-spot-request"]
  value      = "${data.aws_instance.ml-instance.public_ip}"
}
使い方
いつも通りです。
使う時
terraform plan
terraform apply
消す時
terraform plan --destroy
terraform destroy
その他
スポットリクエスト以外のリソースを毎回消して作るのも非効率だなと思って、
自分は以下のような感じで使ってます。
terraform apply --target=aws_spot_fleet_request.ml-spot-request
terraform destroy --target=aws_spot_fleet_request.ml-spot-request
今後の予定
次はEC2でS3をマウントさせる部分を自動化しようと思っています。