Why can't my ECS task pull a container image from Docker Hub?

279 Views Asked by At

I am trying to deploy MLflow on ECS Fargate using Terraform. I am trying to set up a private subnet with a NAT gateway so that the ECS task can pull the MLflow image from Docker Hub. Here is my Terraform:

    # Region of the active AWS provider; read further down for the "awslogs-region" log option.
    data "aws_region" "current" {}

# IAM role handed to the running containers (referenced as task_role_arn by the task definition).
resource "aws_iam_role" "ecs_task" {
  name = "mlflow-dev-ecs-task"
  tags = local.tags

  # Trust policy: only the ECS tasks service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "ecs-tasks.amazonaws.com" }
    }]
  })
}

# IAM role used by ECS itself to start the task (referenced as execution_role_arn by the task definition).
resource "aws_iam_role" "ecs_execution" {
  name = "mlflow-dev-ecs-execution"
  tags = local.tags

  # Trust policy: only the ECS tasks service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "ecs-tasks.amazonaws.com" }
    }]
  })
}

# Grants the execution role the AWS-managed ECS task execution policy.
resource "aws_iam_role_policy_attachment" "ecs_execution" {
  role       = aws_iam_role.ecs_execution.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

# Security group attached to the MLflow Fargate tasks.
resource "aws_security_group" "ecs_service" {
  name   = "mlflow-dev-ecs-service"
  vpc_id = "vpc-XXXXXXXX"
  tags   = local.tags

  # Inbound: only from the load balancer's security group, and only on the service port.
  ingress {
    protocol        = "tcp"
    from_port       = local.service_port
    to_port         = local.service_port
    security_groups = [aws_security_group.lb.id]
  }

  # Outbound: unrestricted (every protocol and port, any destination).
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}

# CloudWatch log group that receives the container logs (see the awslogs options on the task definition).
resource "aws_cloudwatch_log_group" "mlflow" {
  name = "/aws/ecs/mlflow-dev"
  tags = local.tags

  # Log events are kept for 90 days.
  retention_in_days = 90
}

# ECS cluster that the MLflow service is placed in.
resource "aws_ecs_cluster" "mlflow" {
  tags = local.tags
  name = "mlflow-dev"
}

# Fargate task definition running a single essential "mlflow" container.
resource "aws_ecs_task_definition" "mlflow" {
  family = "mlflow-dev"
  tags   = local.tags

  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  execution_role_arn       = aws_iam_role.ecs_execution.arn
  task_role_arn            = aws_iam_role.ecs_task.arn

  # Task-level sizing; the original author was unsure whether this is oversized.
  cpu    = 2048
  memory = 4096

  container_definitions = jsonencode([
    {
      name      = "mlflow"
      image     = "ghcr.io/mlflow/mlflow"
      essential = true

      portMappings = [{ containerPort = local.service_port }]

      # As of version 1.9.1, MLflow cannot take the backend store URI from an
      # environment variable, and ECS cannot expand secret environment variables
      # inside the command. The entrypoint is therefore overridden with a shell
      # (the image is assumed to ship one) so the secret is interpolated at runtime.
      entryPoint = ["sh", "-c"]
      command = [
        "/bin/sh -c \"mlflow server --host=0.0.0.0 --port=${local.service_port} --default-artifact-root=s3://${local.artifact_bucket_id}/ --backend-store-uri=mysql+pymysql://${aws_rds_cluster.backend_store.master_username}:`echo -n $DB_PASSWORD`@${aws_rds_cluster.backend_store.endpoint}:${aws_rds_cluster.backend_store.port}/${aws_rds_cluster.backend_store.database_name} --gunicorn-opts '' \""
      ]

      # DB_PASSWORD is injected from Secrets Manager when the container starts.
      secrets = [
        {
          name      = "DB_PASSWORD"
          valueFrom = aws_secretsmanager_secret.db_password.arn
        }
      ]

      # Container output goes to the CloudWatch log group defined in this configuration.
      logConfiguration = {
        logDriver     = "awslogs"
        secretOptions = null
        options = {
          "awslogs-group"         = aws_cloudwatch_log_group.mlflow.name
          "awslogs-region"        = data.aws_region.current.name
          "awslogs-stream-prefix" = "cis"
        }
      }
    }
  ])
}

# Subnet used by the ECS service; in this configuration it is also given to the load balancer
# and to the NAT gateway.
resource "aws_subnet" "mlflow-dev-service-subnet" {
  vpc_id                  = "vpc-XXXXXXXX"
  cidr_block              = "XXXXXXXX"  
  map_public_ip_on_launch = "false" // resources launched here are not auto-assigned a public IP; this alone does not make a subnet public or private
  availability_zone       = "eu-west-1a"
  tags = {
    Name = "mlflow-dev-service-subnet"
  }
}

# Runs the MLflow task definition as a Fargate service registered with the load balancer target group.
resource "aws_ecs_service" "mlflow" {
  name            = "mlflow-dev"
  cluster         = aws_ecs_cluster.mlflow.id
  task_definition = aws_ecs_task_definition.mlflow.arn

  launch_type      = "FARGATE"
  platform_version = "1.4.0"
  desired_count    = 2

  # Tasks receive no public IP, so any outbound traffic depends on the subnet's route table.
  network_configuration {
    assign_public_ip = false
    subnets          = [aws_subnet.mlflow-dev-service-subnet.id]
    security_groups  = [aws_security_group.ecs_service.id]
  }

  load_balancer {
    container_name   = "mlflow"
    container_port   = local.service_port
    target_group_arn = aws_lb_target_group.mlflow.arn
  }

  # Later changes to desired_count are not reverted by Terraform.
  lifecycle {
    ignore_changes = [desired_count]
  }

  # Explicit ordering: the load balancer is created before the service.
  depends_on = [aws_lb.mlflow]
}

# Registers the service's desired count as a scalable target.
# min and max are both 2, so the capacity range is currently a single value.
resource "aws_appautoscaling_target" "mlflow" {
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${aws_ecs_cluster.mlflow.name}/${aws_ecs_service.mlflow.name}"

  min_capacity = 2
  max_capacity = 2
}

# Security group for the load balancer; its rules are declared as separate
# aws_security_group_rule resources rather than inline.
resource "aws_security_group" "lb" {
  name   = "mlflow-dev-lb"
  vpc_id = "vpc-XXXXXXXXXX"
  tags   = local.tags
}

# Inbound HTTP (port 80) to the load balancer from the configured CIDR range.
# The previous description was copied from the egress rule and described LB -> ECS traffic.
resource "aws_security_group_rule" "lb_ingress_http" {
  description = "Allow inbound HTTP to the load balancer"
  type        = "ingress"
  from_port   = 80
  to_port     = 80
  protocol    = "tcp"
  # should be cidr range of the vpc
  # vpc.vpc_cidr_block
  # NOTE(review): the original notes disagree on whether this /16 is the VPC range or the
  # range of mlflow-dev-service-subnet — confirm which one is intended.
  cidr_blocks       = ["XXX.XX.0.0/16"]
  security_group_id = aws_security_group.lb.id
}

# Inbound HTTPS (port 443) to the load balancer from the configured CIDR range.
# The previous description was copied from the egress rule and described LB -> ECS traffic.
# NOTE(review): only a port-80 listener is visible in this snippet — confirm a 443 listener exists.
resource "aws_security_group_rule" "lb_ingress_https" {
  description = "Allow inbound HTTPS to the load balancer"
  type        = "ingress"
  from_port   = 443
  to_port     = 443
  protocol    = "tcp"
  # NOTE(review): originally annotated as the CIDR block of mlflow-dev-service-subnet — confirm.
  cidr_blocks       = ["XXX.XX.0.0/16"]
  security_group_id = aws_security_group.lb.id
}

# Outbound rule on the load balancer's security group: traffic may leave only towards the
# ECS service's security group, on the service port.
resource "aws_security_group_rule" "lb_egress" {
  type              = "egress"
  security_group_id = aws_security_group.lb.id
  description       = "Only allow load balancer to reach the ECS service on the right port"

  protocol  = "tcp"
  from_port = local.service_port
  to_port   = local.service_port

  # For an egress rule this names the destination security group.
  source_security_group_id = aws_security_group.ecs_service.id
}

# Internal application load balancer in front of the MLflow service.
resource "aws_lb" "mlflow" {
  name = "mlflow-dev"
  tags = local.tags

  load_balancer_type = "application"
  internal           = true
  security_groups    = [aws_security_group.lb.id]

  # Spans the service subnet plus one pre-existing subnet referenced by literal ID.
  subnets = [
    aws_subnet.mlflow-dev-service-subnet.id,
    "subnet-0ae9eae7be10c1603",
  ]
}

# Target group the ECS service registers its tasks with.
resource "aws_lb_target_group" "mlflow" {
  name   = "mlflow-dev"
  vpc_id = "vpc-XXXXXXXXX"

  # Targets are registered by IP address.
  target_type = "ip"
  protocol    = "HTTP"
  port        = local.service_port

  # A target counts as healthy when GET /health answers with a status in 200-202.
  health_check {
    path     = "/health"
    protocol = "HTTP"
    matcher  = "200-202"
  }
}

# Plain-HTTP listener on port 80 that forwards every request to the MLflow target group.
resource "aws_lb_listener" "http" {
  load_balancer_arn = aws_lb.mlflow.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type = "forward"
    # Use the explicit .arn attribute, matching how aws_ecs_service.mlflow references the
    # same target group (for this resource type, id and arn hold the same value).
    target_group_arn = aws_lb_target_group.mlflow.arn
  }
}

# Elastic IP allocated for the NAT gateway (referenced by its allocation_id).
# NOTE(review): newer AWS provider releases deprecate `vpc = true` in favor of
# `domain = "vpc"` — confirm against the provider version in use.
resource "aws_eip" "nat_gateway" {
  vpc = true
}

# NAT gateway backed by the Elastic IP aws_eip.nat_gateway.
# NOTE(review): subnet_id is the same subnet the ECS tasks run in, and that subnet's default
# route (aws_route_table.instance) points back at this gateway. A NAT gateway has to sit in a
# subnet that itself has a route to an internet gateway, otherwise it cannot forward traffic out.
resource "aws_nat_gateway" "nat_gateway" {
  allocation_id = aws_eip.nat_gateway.id
  subnet_id = aws_subnet.mlflow-dev-service-subnet.id
  tags = {
    "Name" = "DevNatGateway"
  }
}

# Exposes the public address of the Elastic IP attached to the NAT gateway.
output "nat_gateway_ip" {
  value = aws_eip.nat_gateway.public_ip
}

# Route table whose default route (0.0.0.0/0) sends traffic to the NAT gateway.
# No route to an internet gateway is declared in this table.
resource "aws_route_table" "instance" {
  vpc_id = "vpc-XXXXXXXXX"
  route {
    cidr_block = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.nat_gateway.id
  }
}

# Attaches the NAT-routed table to the service subnet — the same subnet the NAT gateway is placed in.
resource "aws_route_table_association" "instance" {
  subnet_id = aws_subnet.mlflow-dev-service-subnet.id
  route_table_id = aws_route_table.instance.id
}

Every time I deploy the above, the ECS task fails with the following error:

CannotPullContainerError: pull image manifest has been retried 5 time(s): failed to resolve ref ghcr.io/mlflow/mlflow:latest: failed to do request: Head "https://ghcr.io/v2/mlflow/mlflow/manifests/latest": dial tcp XXX.XX.XXX.34:443: i/o timeout

I've tried to follow this example for setting up a NAT gateway (https://dev.betterdoc.org/infrastructure/2020/02/04/setting-up-a-nat-gateway-on-aws-using-terraform.html), but I can't figure out where I'm going wrong. Any help would be much appreciated.

1

There is 1 best solution below

0
Mark B On

You are creating a private subnet to deploy your ECS service, and then you are creating a NAT Gateway inside the same subnet. A NAT Gateway can't work in a private subnet. The NAT Gateway itself must be in a public subnet (a subnet with a route to an Internet Gateway). Your NAT Gateway currently doesn't have a route to the Internet, so it can't forward any outgoing traffic to the Internet.

Your configuration needs to be:

  • Public subnet with a route to an Internet Gateway
  • NAT Gateway inside the public subnet
  • Private subnet with a route to the NAT Gateway
  • ECS service deployed in the private subnet