TerraformでRDS→Glue→Redshift構築
前提
AWS CLIがセットアップ済みで使えること
Terraformがインストール済みで使えること
出来上がるもの
VPC
IAM
RDS
Redshift
Glue
※クローラーの実行、JOBの作成はGUIからやる想定
ネットワーク
# VPC that the subnets, route table, security group and S3 endpoint in this
# file are created in.
resource "aws_vpc" "vpc_glue_test" {
  cidr_block       = "10.0.0.0/16"
  instance_tenancy = "default"

  # Both DNS resolution and DNS hostnames are switched on for this VPC.
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name = "vpc_glue_test"
  }
}
# Route table for the VPC. Both subnets are associated with it and the S3 VPC
# endpoint registers itself in it.
resource "aws_route_table" "route_table_glue_test" {
  tags = {
    Name = "route_table_glue_test"
  }

  vpc_id = aws_vpc.vpc_glue_test.id
}
# Attaches subnet 1 to the shared route table.
resource "aws_route_table_association" "route_association1_glue_test" {
  route_table_id = aws_route_table.route_table_glue_test.id
  subnet_id      = aws_subnet.subnet1_glue_test.id
}
# Attaches subnet 2 to the shared route table.
resource "aws_route_table_association" "route_association2_glue_test" {
  route_table_id = aws_route_table.route_table_glue_test.id
  subnet_id      = aws_subnet.subnet2_glue_test.id
}
# First subnet (10.0.1.0/24, ap-northeast-1a). The Glue connections in this
# file are placed in this subnet.
resource "aws_subnet" "subnet1_glue_test" {
  vpc_id = aws_vpc.vpc_glue_test.id

  cidr_block        = "10.0.1.0/24"
  availability_zone = "ap-northeast-1a"

  tags = {
    Name = "subnet1_glue_test"
  }
}
# Second subnet (10.0.2.0/24, ap-northeast-1c). It is listed next to subnet 1
# in the RDS and Redshift subnet groups.
resource "aws_subnet" "subnet2_glue_test" {
  vpc_id = aws_vpc.vpc_glue_test.id

  cidr_block        = "10.0.2.0/24"
  availability_zone = "ap-northeast-1c"

  tags = {
    Name = "subnet2_glue_test"
  }
}
# Security group shared by the RDS instance, the Redshift cluster and the Glue
# connections.
#
# NOTE: the resource label and name keep the original "securty" spelling
# because other resources in this file reference it by that label.
resource "aws_security_group" "securty_group_glue_test" {
  name   = "securty_group_glue_test"
  vpc_id = aws_vpc.vpc_glue_test.id

  # Self-referencing inbound rule: all TCP ports, but only from members of
  # this same security group.
  ingress {
    from_port = 0
    to_port   = 65535
    protocol  = "tcp"
    self      = true
  }

  # Self-referencing outbound rule: all TCP ports, but only to members of this
  # same security group.
  egress {
    from_port = 0
    to_port   = 65535
    protocol  = "tcp"
    self      = true
  }

  # Outbound HTTPS to S3 through the gateway VPC endpoint. With only the
  # self-referencing egress rule above, traffic to the S3 prefix list is not
  # allowed by this security group.
  egress {
    from_port       = 443
    to_port         = 443
    protocol        = "tcp"
    prefix_list_ids = [aws_vpc_endpoint.vpc_endpoint_glue_test.prefix_list_id]
  }

  tags = {
    Name = "securty_group_glue_test"
  }
}
# S3 VPC endpoint for the VPC, registered in the shared route table.
resource "aws_vpc_endpoint" "vpc_endpoint_glue_test" {
  vpc_id       = aws_vpc.vpc_glue_test.id
  service_name = "com.amazonaws.ap-northeast-1.s3"

  # "Gateway" is the provider default; it is only spelled out here.
  vpc_endpoint_type = "Gateway"

  route_table_ids = [aws_route_table.route_table_glue_test.id]

  tags = {
    Environment = "vpc_endpoint_glue_test"
  }
}
セキュリティグループ
Glueのクローラーが這い回る用のセキュリティグループです。
インバウンドとアウトバウンドを「自己参照」で全てのTCPを許可します。
自己参照のセキュリティグループ:
同セキュリティグループ内に属する者だけがすり抜けられます。
参考URL:
https://dev.classmethod.jp/articles/getting-started-aws-glue-access-data-stores-settings/
サブネット
サブネットが2つあるのは、RDSのサブネットグループに異なるアベイラビリティゾーンのサブネットを2つ以上指定しないといけないからです。
エンドポイント
Glueをインターネットに出ることなくVPC内で完結させるためのVPCエンドポイントです。
Glueの内部処理でS3バケットを使用するため、S3フルアクセスのVPCエンドポイントが必要になります。
IAM
# Instance profile that wraps the Glue role.
# NOTE(review): no other resource in this file references this profile, and the
# role's trust policy only names glue.amazonaws.com - confirm it is needed.
resource "aws_iam_instance_profile" "role_profile_glue_test" {
  role = aws_iam_role.role_glue_test.name
  name = "instance_role"
}
# IAM role used by the Glue crawler.
resource "aws_iam_role" "role_glue_test" {
  name = "role_glue_test"

  # Trust policy: only the AWS Glue service may assume this role.
  # Built with jsonencode() so the document is checked as an HCL expression
  # instead of being an opaque heredoc string.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Principal = {
          Service = "glue.amazonaws.com"
        }
        Action = "sts:AssumeRole"
      },
    ]
  })
}
# Inline policy attached to the Glue role.
#
# The document is built with jsonencode() rather than a heredoc so that syntax
# errors surface at plan time. Every statement of the original document is
# kept as-is.
# NOTE(review): the last statement grants s3:* on every resource, which is a
# superset of the narrower S3 statements before it.
resource "aws_iam_role_policy" "role_policy_glue_test" {
  name = "role_policy_glue_test"
  role = aws_iam_role.role_glue_test.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      # Glue itself plus the read-only S3/EC2/IAM/CloudWatch calls.
      {
        Effect = "Allow"
        Action = [
          "glue:*",
          "s3:GetBucketLocation",
          "s3:ListBucket",
          "s3:ListAllMyBuckets",
          "s3:GetBucketAcl",
          "ec2:DescribeVpcEndpoints",
          "ec2:DescribeRouteTables",
          "ec2:CreateNetworkInterface",
          "ec2:DeleteNetworkInterface",
          "ec2:DescribeNetworkInterfaces",
          "ec2:DescribeSecurityGroups",
          "ec2:DescribeSubnets",
          "ec2:DescribeVpcAttribute",
          "iam:ListRolePolicies",
          "iam:GetRole",
          "iam:GetRolePolicy",
          "cloudwatch:PutMetricData",
        ]
        Resource = ["*"]
      },
      # Creation of buckets whose name starts with "aws-glue-".
      {
        Effect   = "Allow"
        Action   = ["s3:CreateBucket"]
        Resource = ["arn:aws:s3:::aws-glue-*"]
      },
      # Object read/write/delete under "aws-glue-" buckets and prefixes.
      {
        Effect = "Allow"
        Action = [
          "s3:GetObject",
          "s3:PutObject",
          "s3:DeleteObject",
        ]
        Resource = [
          "arn:aws:s3:::aws-glue-*/*",
          "arn:aws:s3:::*/*aws-glue-*/*",
        ]
      },
      # Object read on the "crawler-public*" and "aws-glue-*" ARNs.
      {
        Effect = "Allow"
        Action = ["s3:GetObject"]
        Resource = [
          "arn:aws:s3:::crawler-public*",
          "arn:aws:s3:::aws-glue-*",
        ]
      },
      # CloudWatch Logs under the /aws-glue/ prefix.
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
        ]
        Resource = ["arn:aws:logs:*:*:/aws-glue/*"]
      },
      # Tagging of EC2 resources, limited to the aws-glue-service-resource key.
      {
        Effect = "Allow"
        Action = [
          "ec2:CreateTags",
          "ec2:DeleteTags",
        ]
        Condition = {
          "ForAllValues:StringEquals" = {
            "aws:TagKeys" = ["aws-glue-service-resource"]
          }
        }
        Resource = [
          "arn:aws:ec2:*:*:network-interface/*",
          "arn:aws:ec2:*:*:security-group/*",
          "arn:aws:ec2:*:*:instance/*",
        ]
      },
      # Full S3 access on every resource.
      {
        Effect   = "Allow"
        Action   = "s3:*"
        Resource = "*"
      },
    ]
  })
}
信頼されたエンティティ
Glueが徘徊する用に必要なので"Principal"はGlueです。
「IAM/ロール/<ロール名>/信頼関係/ポリシードキュメントの表示」で閲覧可能です。
ポリシー
AWS管理ポリシーの「AWSGlueServiceRole」と「AmazonS3FullAccess」に相当する権限を、インラインポリシー("aws_iam_role_policy")としてロールに付与します。
RDS
# DB subnet group for the RDS instance, made of the two subnets of the VPC.
resource "aws_db_subnet_group" "rds_subnet_glue_test" {
  name = "rds_subnet_glue_test"

  # Direct references instead of the deprecated interpolation-only form
  # "${...}".
  subnet_ids = [
    aws_subnet.subnet1_glue_test.id,
    aws_subnet.subnet2_glue_test.id,
  ]

  tags = {
    Name = "subnet_glue_test"
  }
}
# PostgreSQL RDS instance that the Glue crawler reads from.
#
# NOTE(review): the credentials are hard-coded test values; move them to
# variables or a secret store before using this outside a sandbox.
resource "aws_db_instance" "rds_glue_test" {
  identifier        = "rds-glue-test"
  allocated_storage = 20
  storage_type      = "gp2"
  engine            = "postgres"
  engine_version    = "11.5"
  instance_class    = "db.t3.micro"

  name     = "testdb"
  username = "testuser"
  password = "testpassword"

  # Direct references instead of the deprecated interpolation-only form
  # "${...}"; the subnet group is referenced by its name attribute.
  vpc_security_group_ids = [aws_security_group.securty_group_glue_test.id]
  db_subnet_group_name   = aws_db_subnet_group.rds_subnet_glue_test.name

  # No final snapshot is taken when the instance is destroyed.
  skip_final_snapshot = true
}
サブネット
RDSを配置するVPCは直接指定できないため、そのVPC内のサブネットをまとめた"aws_db_subnet_group"を作成して指定しています。
セキュリティグループ
VPC内に作った自己参照セキュリティグループ内に配置します。
skip_final_snapshot
これtrueにしておかないとデストロイに手間取りました。
本番DBであればfalseでいいかと思います。
Redshift
# Redshift subnet group, made of the two subnets of the VPC.
resource "aws_redshift_subnet_group" "redshift_subnet_glue_test" {
  name = "redshift-subnet-glue-test"

  # Direct references instead of the deprecated interpolation-only form
  # "${...}".
  subnet_ids = [
    aws_subnet.subnet1_glue_test.id,
    aws_subnet.subnet2_glue_test.id,
  ]

  tags = {
    environment = "subnet_glue_test"
  }
}
# Single-node Redshift cluster, not publicly accessible, placed in the VPC via
# the Redshift subnet group.
#
# NOTE(review): the credentials are hard-coded test values; move them to
# variables or a secret store before using this outside a sandbox.
# NOTE(review): confirm that node type "dc1.large" is still offered in the
# target region.
resource "aws_redshift_cluster" "redshift_glue_test" {
  cluster_identifier = "redshift-glue-test"
  database_name      = "testdwh"
  master_username    = "testuser"
  master_password    = "Test2020"

  node_type    = "dc1.large"
  cluster_type = "single-node"

  publicly_accessible       = false
  cluster_subnet_group_name = aws_redshift_subnet_group.redshift_subnet_glue_test.name

  # Direct reference instead of the deprecated interpolation-only form
  # "${...}".
  vpc_security_group_ids = [aws_security_group.securty_group_glue_test.id]

  # No final snapshot is taken when the cluster is destroyed.
  skip_final_snapshot = true
}
サブネット/セキュリティグループ/skip_final_snapshot
やりたいことはRDSと同じ。
Glue
# Glue Data Catalog database that the crawler writes into.
resource "aws_glue_catalog_database" "database_glue_test" { name = "database_glue_test" }
# Glue JDBC connection to the RDS PostgreSQL instance.
resource "aws_glue_connection" "rds_connection_glue_test" {
  name = "rds_connection_glue_test"

  connection_properties = {
    JDBC_CONNECTION_URL = "jdbc:postgresql://${aws_db_instance.rds_glue_test.endpoint}/testdb"

    # Taken from the DB instance so the connection cannot drift from the
    # credentials the instance is actually created with.
    USERNAME = aws_db_instance.rds_glue_test.username
    PASSWORD = aws_db_instance.rds_glue_test.password
  }

  # The connection is placed in subnet 1 with the shared security group.
  physical_connection_requirements {
    availability_zone      = aws_subnet.subnet1_glue_test.availability_zone
    security_group_id_list = [aws_security_group.securty_group_glue_test.id]
    subnet_id              = aws_subnet.subnet1_glue_test.id
  }
}
# Glue JDBC connection to the Redshift cluster.
resource "aws_glue_connection" "redshift_connection_glue_test" {
  name = "redshift_connection_glue_test"

  connection_properties = {
    # Redshift uses the jdbc:redshift:// scheme; the original used
    # jdbc:postgresql://, which is the scheme for PostgreSQL data stores.
    JDBC_CONNECTION_URL = "jdbc:redshift://${aws_redshift_cluster.redshift_glue_test.endpoint}/${aws_redshift_cluster.redshift_glue_test.database_name}"

    # Taken from the cluster so the connection cannot drift from the
    # credentials the cluster is actually created with.
    USERNAME = aws_redshift_cluster.redshift_glue_test.master_username
    PASSWORD = aws_redshift_cluster.redshift_glue_test.master_password
  }

  # The connection is placed in subnet 1 with the shared security group.
  physical_connection_requirements {
    availability_zone      = aws_subnet.subnet1_glue_test.availability_zone
    security_group_id_list = [aws_security_group.securty_group_glue_test.id]
    subnet_id              = aws_subnet.subnet1_glue_test.id
  }
}
# Crawler that reads the RDS database through the JDBC connection and writes
# into the Glue catalog database, running as the Glue role.
# NOTE(review): the crawler's name is "database_glue_test", the same string as
# the catalog database name - confirm this is intended.
resource "aws_glue_crawler" "crawler_glue_test" {
  name          = "database_glue_test"
  role          = aws_iam_role.role_glue_test.arn
  database_name = aws_glue_catalog_database.database_glue_test.name

  jdbc_target {
    path            = "testdb/%"
    connection_name = aws_glue_connection.rds_connection_glue_test.name
  }
}
データベース
"aws_glue_catalog_database" で箱だけ作るイメージです。
コネクション
"aws_glue_connection"でJDBC接続の設定を入れてあげます。
クローラー
どこに潜ってデータカタログを作成するかを記述しています。
最後に
あとはGUIでクローラーを実行してあげれば、
RDS内のテーブルのメタデータを引っ張ってきてくれます。