I have an environment set up with Terraform. Our SecOps team needs SonarQube CE installed for automatic scans, code smells, and vulnerability checks. It's running in AWS, reachable over our VPN: DNS resolves to an internal-facing ALB, which forwards traffic to a target group of instances that make up the ECS cluster. There is a launch configuration and auto scaling group as well. I'm only running one container per host at the moment, but I'd like to raise that to 2-4 at some point.
The problem I am having is that each instance registers with the target group twice: once on the container port (9000) and once on the dynamic ephemeral port (e.g. 32768). The health checks against the dynamic port pass, but the health checks against port 9000 fail, so the instances cycle between initial, unhealthy, and terminating. Aside from this annoying issue the application runs fine: RDS connects and we can use SonarQube without problems.
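The double registration is visible from the CLI as well; describing target health shows every instance twice, healthy on the ephemeral port and unhealthy on 9000 (ARN redacted):

aws elbv2 describe-target-health \
  --target-group-arn arn:aws:elasticloadbalancing:REDACTED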
I've tried removing the references to the container port in Terraform (errors below). I'll also note that this is an extremely locked-down environment: all egress traffic from any VPC is filtered through a McAfee Cloud Proxy appliance. When I first stood this up in a sandbox account with egress to 0.0.0.0/0, everything worked fine. I've spent a few hours on this now and I'm at the point of scratching my head.
Hopefully someone else has been here and will share their insight. Tomorrow is a new day after all. HELP!
Error message when I remove the port from the target group:
aws_lb_target_group.ecs: port should be set when target type is instance
Error message when I set the port to 0:
aws_ecs_service.ecs: InvalidParameterException: The container sonarqube did not have a container port 0 defined.
Error message when I set the container port to 0 in the task definition:
aws_ecs_task_definition.task: ClientException: Invalid 'containerPort' setting for container 'sonarqube'
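For what it's worth, my understanding is that containerPort has to be a real port, and with bridge networking it's the host port that is dynamic: omitting hostPort, or setting it to 0, is what asks Docker for an ephemeral port. So the mapping below should be equivalent to what I have, just with the dynamic behavior spelled out (hostPort added here purely for illustration):

"portMappings": [
  {
    "containerPort": 9000,
    "hostPort": 0
  }
]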
ecs-taskdef.tf
resource "aws_ecs_task_definition" "task" {
family = "${var.name}-${var.env}"
network_mode = "bridge"
cpu = 8192
memory = 16384
execution_role_arn = "${var.ecs-exec-role}"
container_definitions = <<DEFINITION
[
{
"name": "${var.name}",
"image":"${var.image}",
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "/ecs/${var.cluster_name}-${var.name}",
"awslogs-region": "${var.region}",
"awslogs-stream-prefix": "ecs"
}
},
"portMappings": [
{
"containerPort": 9000
}
],
"environment": [
{
"name": "sonar.jdbc.password",
"value": "${var.password}"
},
{
"name": "sonar.jdbc.url",
"value": "jdbc:mysql://${var.rds_url}:${var.port}/sonar?useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true&useConfigs=maxPerformance"
},
{
"name": "sonar.jdbc.username",
"value": "${var.username}"
}
]
}
]
DEFINITION
}
resource "aws_ecs_service" "ecs" {
name = "${var.name}-${var.env}"
cluster = "${var.cluster_name}"
task_definition = "${aws_ecs_task_definition.task.arn}"
scheduling_strategy = "DAEMON"
lifecycle {
ignore_changes = ["desired_count"]
}
load_balancer {
target_group_arn = "${aws_lb_target_group.ecs.arn}"
container_name = "${var.name}"
container_port = 9000 #Removed & Terraform complains with an error.
}
}
elb.tf
resource "aws_lb" "ecs" {
name = "${var.name_prefix}-${var.name}-tf"
internal = true
load_balancer_type = "application"
security_groups = ["${var.security_groups}"]
subnets = ["${var.subnets}"]
enable_deletion_protection = false
tags = "${merge(var.tags, map("Name", "${var.name_prefix}-${var.name}-elb"))}"
}
resource "aws_lb_listener" "ecs" {
load_balancer_arn = "${aws_lb.ecs.arn}"
port = 80
protocol = "HTTP"
default_action {
type = "redirect"
redirect {
port = "443"
protocol = "HTTPS"
status_code = "HTTP_301"
}
}
}
resource "aws_lb_listener" "ssl" {
load_balancer_arn = "${aws_lb.ecs.arn}"
port = 443
protocol = "HTTPS"
lifecycle {
create_before_destroy = true
}
ssl_policy = "ELBSecurityPolicy-2016-08"
certificate_arn = "arn:aws:acm:REDACTED"
default_action {
type = "forward"
target_group_arn = "${aws_lb_target_group.ecs.arn}"
}
}
resource "aws_lb_target_group" "ecs" {
name = "${var.cluster_name}"
protocol = "HTTP"
port = 9000 #must be here or TF errors instance type must have port
vpc_id = "${var.vpc_id}"
lifecycle {
create_before_destroy = true
}
}
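Note I haven't set an explicit health_check block on the target group. My understanding is the default health check port is "traffic-port" (i.e. whatever port each target registered with), which matches what I see: the checks on 32768 pass and the extra registration on 9000 fails. A sketch of making that explicit inside the target group resource (the path and matcher are assumptions on my part):

  health_check {
    protocol = "HTTP"
    port     = "traffic-port" # check each target on the port it registered with
    path     = "/"            # assumption: SonarQube answers 200 on its root page
    matcher  = "200"
  }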
ec2.tf
resource "aws_autoscaling_group" "asg" {
availability_zones = ["${var.region}a", "${var.region}b", "${var.region}d"]
name = "${var.name}-${var.env}-asg"
max_size = "${var.min_size}"
min_size = "${var.max_size}"
health_check_grace_period = 300
health_check_type = "ELB"
desired_capacity = "${var.desired_size}"
launch_configuration = "${aws_launch_configuration.alc.name}"
vpc_zone_identifier = ["${var.subnet_ids}"]
target_group_arns = ["${var.target_arn}"]
lifecycle {
create_before_destroy = true
}
tag {
key = "Environment"
value = "${var.name}"
propagate_at_launch = true
}
tag {
key = "Name"
value = "${var.name_prefix}-${var.name}.ecs"
propagate_at_launch = true
}
}
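One theory I haven't been able to test yet: target_group_arns makes the ASG register the raw instances with the target group on its configured port (9000), while the ECS service separately registers each task's dynamic host port, which would explain every instance showing up twice. If that's right, the attachment would just come out of the ASG; a sketch, untested in this environment:

resource "aws_autoscaling_group" "asg" {
  name                      = "${var.name}-${var.env}-asg"
  max_size                  = "${var.max_size}"
  min_size                  = "${var.min_size}"
  desired_capacity          = "${var.desired_size}"
  launch_configuration      = "${aws_launch_configuration.alc.name}"
  vpc_zone_identifier       = ["${var.subnet_ids}"]
  health_check_grace_period = 300
  health_check_type         = "ELB"

  # no target_group_arns here; the ECS service's load_balancer block
  # already registers tasks on their dynamic host ports
}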
resource "aws_launch_configuration" "alc" {
name_prefix = "${var.name_prefix}.ecs"
image_id = "${lookup(var.ecs-images, var.region)}"
instance_type = "${var.instance_type}"
iam_instance_profile = "${aws_iam_instance_profile.ecs-instance-profile.arn}"
user_data = "${data.template_file.userdata.rendered}"
key_name = "${var.key_name}"
security_groups = ["${var.security_groups}"]
lifecycle {
create_before_destroy = true
}
root_block_device {
volume_type = "io1"
iops = "1000"
volume_size = "${var.volume_size}"
}
}
data "template_file" "userdata" {
template = "${file("${path.module}/userdata/ecs-instances.sh")}"
vars {
cluster-name = "${aws_ecs_cluster.cluster.name}"
}
}
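For context, the user data only needs to point the ECS agent at the cluster; a minimal sketch of what ecs-instances.sh does (the real file may do more):

#!/bin/bash
# Join this instance to the ECS cluster passed in from Terraform
echo "ECS_CLUSTER=${cluster-name}" >> /etc/ecs/ecs.config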
resource "aws_security_group" "allow_all_from_cluster" {
name = "${var.name_prefix}-${var.name}-ecs-cluster"
description = "Allow traffic from cluster"
vpc_id = "${var.vpc_id}"
tags = "${merge(var.tags, map("Name", "${var.name_prefix}-${var.name}-sg"))}"
lifecycle {
create_before_destroy = true
}
ingress { #open to VPC IP's
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["172.27.160.0/22"]
}
ingress { #open to corp network redirected to 443
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
ingress { #http access for corp users
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
egress { #open to VPC IP's
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["172.27.160.0/22"]
}
egress { #ephemeral response to corp users
from_port = 32768
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
}
iam.tf
resource "aws_iam_role" "iam_role" {
name = "${var.name}-ecs-role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ecs.amazonaws.com"
},
"Action": "sts:AssumeRole"
},
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
resource "aws_iam_policy" "efs-policy" {
name = "${var.env}-efs-access-policy"
path = "/"
description = "Allow ${var.env} cluster access to EFS"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"elasticfilesystem:*"
],
"Effect": "Allow",
"Resource": "*"
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "ecs-service-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceRole"
}
resource "aws_iam_role_policy_attachment" "ecs-service-for-ec2-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}
resource "aws_iam_role_policy_attachment" "ssm-service-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM"
}
resource "aws_iam_role_policy_attachment" "efs-for-ec2-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "${aws_iam_policy.efs-policy.arn}"
}
resource "aws_iam_instance_profile" "ecs-instance-profile" {
name = "${var.env}-ecs"
role = "${aws_iam_role.iam_role.name}"
}
I expected health checks to take place only on the dynamic port. Each instance shows up in the registered targets section twice, once for each port. If I manually deregister the port 9000 entry, the instance stays in service.
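My manual workaround for now is deregistering the duplicate by hand, e.g. (instance ID is a placeholder, ARN redacted):

aws elbv2 deregister-targets \
  --target-group-arn arn:aws:elasticloadbalancing:REDACTED \
  --targets Id=i-0123456789abcdef0,Port=9000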