I am trying to deploy mlflow on ecs fargate using terraform. I am trying to set up a private subnet with a nat gateway, so that the ecs task will be able to pull the mlflow image from the docker hub. Here is my terraform:
data "aws_region" "current" {}
resource "aws_iam_role" "ecs_task" {
name = "mlflow-dev-ecs-task"
tags = local.tags
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
Effect = "Allow"
},
]
})
}
resource "aws_iam_role" "ecs_execution" {
name = "mlflow-dev-ecs-execution"
tags = local.tags
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
Effect = "Allow"
},
]
})
}
resource "aws_iam_role_policy_attachment" "ecs_execution" {
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
role = aws_iam_role.ecs_execution.name
}
resource "aws_security_group" "ecs_service" {
name = "mlflow-dev-ecs-service"
tags = local.tags
vpc_id = "vpc-XXXXXXXX"
ingress {
from_port = local.service_port
to_port = local.service_port
protocol = "tcp"
security_groups = [aws_security_group.lb.id]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_cloudwatch_log_group" "mlflow" {
name = "/aws/ecs/mlflow-dev"
retention_in_days = 90
tags = local.tags
}
resource "aws_ecs_cluster" "mlflow" {
name = "mlflow-dev"
tags = local.tags
}
resource "aws_ecs_task_definition" "mlflow" {
family = "mlflow-dev"
tags = local.tags
container_definitions = jsonencode(concat([
{
name = "mlflow"
image = "ghcr.io/mlflow/mlflow"
essential = true
# As of version 1.9.1, MLflow doesn't support specifying the backend store uri as an environment variable. ECS doesn't allow evaluating secret environment variables from within the command. Therefore, we are forced to override the entrypoint and assume the docker image has a shell we can use to interpolate the secret at runtime.
entryPoint = ["sh", "-c"]
command = [
"/bin/sh -c \"mlflow server --host=0.0.0.0 --port=${local.service_port} --default-artifact-root=s3://${local.artifact_bucket_id}/ --backend-store-uri=mysql+pymysql://${aws_rds_cluster.backend_store.master_username}:`echo -n $DB_PASSWORD`@${aws_rds_cluster.backend_store.endpoint}:${aws_rds_cluster.backend_store.port}/${aws_rds_cluster.backend_store.database_name} --gunicorn-opts '' \""
]
portMappings = [{ containerPort = local.service_port }]
secrets = [
{
name = "DB_PASSWORD"
valueFrom = aws_secretsmanager_secret.db_password.arn
},
]
logConfiguration = {
logDriver = "awslogs"
secretOptions = null
options = {
"awslogs-group" = aws_cloudwatch_log_group.mlflow.name
"awslogs-region" = data.aws_region.current.name
"awslogs-stream-prefix" = "cis"
}
}
},
], []))
network_mode = "awsvpc"
task_role_arn = aws_iam_role.ecs_task.arn
execution_role_arn = aws_iam_role.ecs_execution.arn
requires_compatibilities = ["FARGATE"]
# is this overkill?
cpu = 2048
memory = 4096
}
resource "aws_subnet" "mlflow-dev-service-subnet" {
vpc_id = "vpc-XXXXXXXX"
cidr_block = "XXXXXXXX"
map_public_ip_on_launch = "false" //it makes this a public subnet
availability_zone = "eu-west-1a"
tags = {
Name = "mlflow-dev-service-subnet"
}
}
resource "aws_ecs_service" "mlflow" {
name = "mlflow-dev"
cluster = aws_ecs_cluster.mlflow.id
task_definition = aws_ecs_task_definition.mlflow.arn
desired_count = 2
launch_type = "FARGATE"
platform_version = "1.4.0"
network_configuration {
subnets = [aws_subnet.mlflow-dev-service-subnet.id]
security_groups = [aws_security_group.ecs_service.id]
assign_public_ip = false
}
load_balancer {
target_group_arn = aws_lb_target_group.mlflow.arn
container_name = "mlflow"
container_port = local.service_port
}
lifecycle {
ignore_changes = [desired_count]
}
depends_on = [
aws_lb.mlflow,
]
}
resource "aws_appautoscaling_target" "mlflow" {
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.mlflow.name}/${aws_ecs_service.mlflow.name}"
scalable_dimension = "ecs:service:DesiredCount"
max_capacity = 2
min_capacity = 2
}
resource "aws_security_group" "lb" {
name = "mlflow-dev-lb"
tags = local.tags
vpc_id = "vpc-XXXXXXXXXX"
}
resource "aws_security_group_rule" "lb_ingress_http" {
description = "Only allow load balancer to reach the ECS service on the right port"
type = "ingress"
from_port = 80
to_port = 80
protocol = "tcp"
# should be cidr range of the vpc
# vpc.vpc_cidr_block
cidr_blocks = ["XXX.XX.0.0/16"] # cidr block of mlflow-dev-service-subnet
security_group_id = aws_security_group.lb.id
}
resource "aws_security_group_rule" "lb_ingress_https" {
description = "Only allow load balancer to reach the ECS service on the right port"
type = "ingress"
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["XXX.XX.0.0/16"] # cidr block of mlflow-dev-service-subnet
security_group_id = aws_security_group.lb.id
}
resource "aws_security_group_rule" "lb_egress" {
description = "Only allow load balancer to reach the ECS service on the right port"
type = "egress"
from_port = local.service_port
to_port = local.service_port
protocol = "tcp"
source_security_group_id = aws_security_group.ecs_service.id
security_group_id = aws_security_group.lb.id
}
resource "aws_lb" "mlflow" {
name = "mlflow-dev"
tags = local.tags
internal = true
load_balancer_type = "application"
security_groups = [aws_security_group.lb.id]
subnets = [aws_subnet.mlflow-dev-service-subnet.id, "subnet-0ae9eae7be10c1603"]
}
resource "aws_lb_target_group" "mlflow" {
name = "mlflow-dev"
port = local.service_port
protocol = "HTTP"
vpc_id = "vpc-XXXXXXXXX"
target_type = "ip"
health_check {
protocol = "HTTP"
matcher = "200-202"
path = "/health"
}
}
resource "aws_lb_listener" "http" {
load_balancer_arn = aws_lb.mlflow.arn
port = "80"
protocol = "HTTP"
default_action {
target_group_arn = aws_lb_target_group.mlflow.id
type = "forward"
}
}
resource "aws_eip" "nat_gateway" {
vpc = true
}
resource "aws_nat_gateway" "nat_gateway" {
allocation_id = aws_eip.nat_gateway.id
subnet_id = aws_subnet.mlflow-dev-service-subnet.id
tags = {
"Name" = "DevNatGateway"
}
}
output "nat_gateway_ip" {
value = aws_eip.nat_gateway.public_ip
}
resource "aws_route_table" "instance" {
vpc_id = "vpc-XXXXXXXXX"
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.nat_gateway.id
}
}
resource "aws_route_table_association" "instance" {
subnet_id = aws_subnet.mlflow-dev-service-subnet.id
route_table_id = aws_route_table.instance.id
}
Every time I deploy the above, the ecs task throws the following error:
CannotPullContainerError: pull image manifest has been retried 5 time(s): failed to resolve ref ghcr.io/mlflow/mlflow:latest: failed to do request: Head "https://ghcr.io/v2/mlflow/mlflow/manifests/latest": dial tcp XXX.XX.XXX.34:443: i/o timeout
I've tried to follow this example for setting up a NAT gateway (https://dev.betterdoc.org/infrastructure/2020/02/04/setting-up-a-nat-gateway-on-aws-using-terraform.html), and I can't figure out where I'm going wrong. Any help would be much appreciated.
You are creating a private subnet to deploy your ECS service, and then you are creating a NAT Gateway inside the same subnet. A NAT Gateway can't work in a private subnet. The NAT Gateway itself must be in a public subnet (a subnet with a route to an Internet Gateway). Your NAT Gateway currently doesn't have a route to the Internet, so it can't forward any outgoing traffic to the Internet.
Your configuration needs to be: