diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8aa9fc68839..8e1d09cbd68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
+- **CUMULUS-3759**
+  - Migrated `tf-modules/cumulus/ecs_cluster` ECS Autoscaling group from launch configurations to launch templates
 - **CUMULUS-3955**
   - Removed `VACUUM` statements from db migrations. In cases where the PG database is very large, these queries can take a long time and exceed the Lambda timeout, causing failures on deployment.
 
diff --git a/example/config.yml b/example/config.yml
index 118f9e10ea4..5483b0e0847 100644
--- a/example/config.yml
+++ b/example/config.yml
@@ -8,6 +8,11 @@ cumulus-sit:
   apiUsername: jasmine
   pdrNodeNameProviderBucket: cumulus-sit-pdr-node-name-provider
 
+cumulus-std:
+  bucket: cumulus-sit-internal
+  apiUsername: jasmine
+  pdrNodeNameProviderBucket: cumulus-sit-pdr-node-name-provider
+
 cumulus-es:
   bucket: cumulus-sit-internal
   apiUsername: jasmine
diff --git a/example/cumulus-tf/variables.tf b/example/cumulus-tf/variables.tf
index ce4633e9d73..938c83a1193 100644
--- a/example/cumulus-tf/variables.tf
+++ b/example/cumulus-tf/variables.tf
@@ -107,7 +107,7 @@ variable "data_persistence_remote_state_config" {
 }
 
 variable "s3_replicator_config" {
-  type        = object({ source_bucket = string, source_prefix = string, target_bucket = string, target_prefix = string, target_region = string })
+  type        = object({ source_bucket = string, source_prefix = string, target_bucket = string, target_prefix = string, target_region = optional(string) })
   default     = null
   description = "Configuration for the s3-replicator module. Items with prefix of source_prefix in the source_bucket will be replicated to the target_bucket with target_prefix."
 }
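Note: declaring `target_region` as `optional(string)` means a deployment can simply omit the attribute; assuming Terraform 1.3+ (where optional object type attributes are generally available), the omitted attribute is filled in as `null` instead of failing type-checking. A minimal sketch of an assignment that is now accepted — the bucket names below are placeholders, not values from this change:

s3_replicator_config = {
  source_bucket = "example-access-logs"     # placeholder bucket name
  source_prefix = "s3_access_logs"
  target_bucket = "example-metrics-inbound" # placeholder bucket name
  target_prefix = "input/s3_access"
  # target_region omitted: Terraform sets it to null rather than rejecting the object
}

The new `cumulus-std` tfvars file added below relies on exactly this, passing an `s3_replicator_config` without `target_region`.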
diff --git a/example/deployments/cumulus/cumulus-std.tfvars b/example/deployments/cumulus/cumulus-std.tfvars
new file mode 100644
index 00000000000..2296e4a570f
--- /dev/null
+++ b/example/deployments/cumulus/cumulus-std.tfvars
@@ -0,0 +1,65 @@
+prefix = "cumulus-std"
+
+buckets = {
+  internal = {
+    name = "cumulus-sit-internal"
+    type = "internal"
+  },
+  private = {
+    name = "cumulus-sit-private"
+    type = "private"
+  },
+  protected = {
+    name = "cumulus-sit-protected"
+    type = "protected"
+  },
+  public = {
+    name = "cumulus-sit-public"
+    type = "public"
+  },
+  protected-2 = {
+    name = "cumulus-sit-protected-2"
+    type = "protected"
+  },
+  glacier = {
+    name = "cumulus-sit-orca-glacier"
+    type = "orca"
+  },
+  dashboard = {
+    name = "cumulus-sit-dashboard"
+    type = "dashboard"
+  }
+}
+
+key_name = "lp"
+
+oauth_provider = "launchpad"
+
+saml_entity_id                  = "https://dashboard.cumulus.sit.earthdata.nasa.gov"
+saml_assertion_consumer_service = "https://api.cumulus.sit.earthdata.nasa.gov/saml/auth"
+saml_idp_login                  = "https://auth.launchpad-sbx.nasa.gov/affwebservices/public/saml2sso"
+saml_launchpad_metadata_url     = "https://auth.launchpad-sbx.nasa.gov/unauth/metadata/launchpad-sbx.idp.xml"
+
+deploy_cumulus_distribution = false
+
+archive_api_url             = "https://api.cumulus.sit.earthdata.nasa.gov/"
+private_archive_api_gateway = true
+
+# LOG CONFIGURATION (optional)
+log_api_gateway_to_cloudwatch = true
+
+tea_distribution_url = "https://data.cumulus.sit.earthdata.nasa.gov"
+
+s3_replicator_config = {
+  source_bucket = "cumulus-std-access-logs"
+  source_prefix = "s3_access_logs"
+  target_bucket = "esdis-metrics-inbound-sit-cumulus-std-distribution"
+  target_prefix = "input/s3_access/cumulus-stdsit"
+}
+
+api_reserved_concurrency = 14
+
+lambda_timeouts = {
+  queue_granules_task_timeout: 900,
+  discover_granules_task_timeout: 900
+}
diff --git a/example/deployments/data-persistence/cumulus-std.tfvars b/example/deployments/data-persistence/cumulus-std.tfvars
new file mode 100644
index 00000000000..b530b1d14c5
--- /dev/null
+++ b/example/deployments/data-persistence/cumulus-std.tfvars
@@ -0,0 +1,10 @@
+prefix = "cumulus-std"
+
+elasticsearch_config = {
+  domain_name    = "es"
+  instance_count = 2
+  instance_type  = "t2.small.elasticsearch"
+  version        = "5.3"
+  volume_type    = "gp2"
+  volume_size    = 10
+}
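The `buckets` map above pairs each logical bucket key with a `name`/`type` object. For context, the variable on the receiving side is presumably declared along these lines — shown here as an assumption for illustration, not code copied from this change:

variable "buckets" {
  description = "Map of buckets used by the deployment, keyed by logical bucket name"
  type        = map(object({ name = string, type = string }))
  default     = {}
}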
diff --git a/example/fake-provider-cf.yml b/example/fake-provider-cf.yml
index eefef25d68a..472d2bcd0d8 100644
--- a/example/fake-provider-cf.yml
+++ b/example/fake-provider-cf.yml
@@ -92,19 +92,21 @@ Resources:
         ToPort: 0
       VpcId: !Ref VpcId
 
-  LaunchConfiguration:
-    Type: AWS::AutoScaling::LaunchConfiguration
+  LaunchTemplate:
+    Type: AWS::EC2::LaunchTemplate
     Properties:
-      AssociatePublicIpAddress: false
-      IamInstanceProfile: !Ref InstanceProfile
-      ImageId: !Ref LatestAmiId
-      InstanceMonitoring: false
-      InstanceType: t3.small
-      SecurityGroups:
-        - !Ref SecurityGroup
-      UserData:
-        Fn::Base64:
-          Fn::Sub: |
+      LaunchTemplateName: "fake-provider-launch-template"
+      LaunchTemplateData:
+        IamInstanceProfile:
+          Arn: !GetAtt InstanceProfile.Arn
+        ImageId: !Ref LatestAmiId
+        Monitoring:
+          Enabled: false
+        InstanceType: t3.small
+        SecurityGroupIds:
+          - !Ref SecurityGroup
+        UserData:
+          Fn::Base64: !Sub |
             #!/bin/bash -ex
 
             TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
@@ -280,7 +282,9 @@ Resources:
         MinInstancesInService: 0
     DependsOn: S3ProviderBucket
     Properties:
-      LaunchConfigurationName: !Ref LaunchConfiguration
+      LaunchTemplate:
+        LaunchTemplateId: !Ref LaunchTemplate
+        Version: !GetAtt LaunchTemplate.LatestVersionNumber
       MinSize: "1"
       DesiredCapacity: "1"
       MaxSize: "1"
diff --git a/tf-modules/cumulus/ecs_cluster.tf b/tf-modules/cumulus/ecs_cluster.tf
index 9f733046afd..30be7ea6a44 100644
--- a/tf-modules/cumulus/ecs_cluster.tf
+++ b/tf-modules/cumulus/ecs_cluster.tf
@@ -220,45 +220,96 @@ data "aws_efs_mount_target" "ecs_cluster_instance" {
 }
 
 locals {
-  ecs_instance_autoscaling_cf_template_config = {
+  ecs_instance_autoscaling_user_data_config = {
     cluster_name                   = aws_ecs_cluster.default.name
     container_stop_timeout         = var.ecs_container_stop_timeout,
     docker_hub_config              = var.ecs_docker_hub_config,
-    docker_volume_size             = var.ecs_cluster_instance_docker_volume_size,
     docker_volume_create_size      = var.ecs_cluster_instance_docker_volume_size - 1,
     efs_dns_name                   = var.ecs_efs_config == null ? null : data.aws_efs_mount_target.ecs_cluster_instance[0].dns_name,
     efs_mount_point                = var.ecs_efs_config == null ? null : var.ecs_efs_config.mount_point,
-    image_id                       = var.ecs_cluster_instance_image_id,
     include_docker_cleanup_cronjob = var.ecs_include_docker_cleanup_cronjob,
-    instance_profile               = aws_iam_instance_profile.ecs_cluster_instance.arn,
-    instance_type                  = var.ecs_cluster_instance_type,
-    key_name                       = var.key_name,
-    min_size                       = var.ecs_cluster_min_size,
-    desired_capacity               = var.ecs_cluster_desired_size,
-    max_size                       = var.ecs_cluster_max_size,
     region                         = data.aws_region.current.name
-    security_group_ids = compact(concat(
-      [
-        aws_security_group.ecs_cluster_instance.id,
-        var.elasticsearch_security_group_id,
-        var.rds_security_group
-      ],
-      var.ecs_custom_sg_ids
-    ))
-    subnet_ids         = var.ecs_cluster_instance_subnet_ids,
     task_reaper_object = aws_s3_bucket_object.task_reaper
   }
+
+  security_group_ids = compact(concat(
+    [
+      aws_security_group.ecs_cluster_instance.id,
+      var.elasticsearch_security_group_id,
+      var.rds_security_group
+    ],
+    var.ecs_custom_sg_ids
+  ))
+}
+
+resource "aws_launch_template" "ecs_cluster_instance" {
+  name_prefix            = "${var.prefix}_ecs_cluster_template"
+  key_name               = var.key_name
+  image_id               = var.ecs_cluster_instance_image_id
+  instance_type          = var.ecs_cluster_instance_type
+  vpc_security_group_ids = local.security_group_ids
+  block_device_mappings {
+    device_name = "/dev/xvdcz"
+    ebs {
+      delete_on_termination = true
+      encrypted             = true
+      volume_size           = var.ecs_cluster_instance_docker_volume_size
+    }
+  }
+
+  iam_instance_profile {
+    arn = aws_iam_instance_profile.ecs_cluster_instance.arn
+  }
+  monitoring {
+    enabled = true
+  }
+
+  user_data = base64encode(templatefile(
+    "${path.module}/ecs_cluster_instance_autoscaling_user_data.tmpl",
+    local.ecs_instance_autoscaling_user_data_config
+  ))
 }
 
-resource "aws_cloudformation_stack" "ecs_instance_autoscaling_group" {
-  name          = "${aws_ecs_cluster.default.name}-autoscaling-group"
-  template_body = templatefile("${path.module}/ecs_cluster_instance_autoscaling_cf_template.yml.tmpl", local.ecs_instance_autoscaling_cf_template_config)
-  tags          = var.tags
+resource "aws_autoscaling_group" "ecs_cluster_instance" {
+  name_prefix         = aws_ecs_cluster.default.name
+  desired_capacity    = var.ecs_cluster_desired_size
+  max_size            = var.ecs_cluster_max_size
+  min_size            = var.ecs_cluster_min_size
+  vpc_zone_identifier = var.ecs_cluster_instance_subnet_ids
+
+  instance_refresh {
+    strategy = "Rolling"
+    preferences {
+      min_healthy_percentage = 50
+    }
+  }
+  launch_template {
+    id      = aws_launch_template.ecs_cluster_instance.id
+    version = aws_launch_template.ecs_cluster_instance.latest_version
+  }
+  lifecycle {
+    create_before_destroy = true
+  }
+
+  tag {
+    key                 = "Name"
+    value               = aws_ecs_cluster.default.name
+    propagate_at_launch = true
+  }
+
+  dynamic "tag" {
+    for_each = var.tags
+    content {
+      key                 = tag.key
+      propagate_at_launch = true
+      value               = tag.value
+    }
+  }
 }
 
 resource "aws_autoscaling_lifecycle_hook" "ecs_instance_termination_hook" {
   name                   = "${aws_ecs_cluster.default.name}-ecs-termination-hook"
-  autoscaling_group_name = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
+  autoscaling_group_name = aws_autoscaling_group.ecs_cluster_instance.name
   default_result         = "CONTINUE"
   heartbeat_timeout      = 150
   lifecycle_transition   = "autoscaling:EC2_INSTANCE_TERMINATING"
@@ -267,8 +318,8 @@ resource "aws_autoscaling_lifecycle_hook" "ecs_instance_termination_hook" {
 
 # Scale in config
 resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_in" {
-  name                    = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-scale-in"
-  autoscaling_group_name  = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
+  name                    = "${aws_autoscaling_group.ecs_cluster_instance.name}-scale-in"
+  autoscaling_group_name  = aws_autoscaling_group.ecs_cluster_instance.name
   adjustment_type         = "PercentChangeInCapacity"
   metric_aggregation_type = "Average"
   policy_type             = "StepScaling"
@@ -280,7 +331,7 @@
 }
 
 resource "aws_cloudwatch_metric_alarm" "ecs_instance_autoscaling_group_cpu_scale_in_alarm" {
-  alarm_name          = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-cpu-scale-in"
+  alarm_name          = "${aws_autoscaling_group.ecs_cluster_instance.name}-cpu-scale-in"
   comparison_operator = "LessThanThreshold"
   alarm_actions       = [aws_autoscaling_policy.ecs_instance_autoscaling_group_scale_in.arn]
   datapoints_to_alarm = 1
@@ -298,8 +349,8 @@
 
 # Scale out config
 resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_out" {
-  name                    = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-scale-out"
-  autoscaling_group_name  = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
+  name                    = "${aws_autoscaling_group.ecs_cluster_instance.name}-scale-out"
+  autoscaling_group_name  = aws_autoscaling_group.ecs_cluster_instance.name
   adjustment_type         = "PercentChangeInCapacity"
   metric_aggregation_type = "Average"
   policy_type             = "StepScaling"
@@ -312,7 +363,7 @@
 }
 
 resource "aws_cloudwatch_metric_alarm" "ecs_instance_autoscaling_group_cpu_scale_out_alarm" {
-  alarm_name          = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-cpu-scale-out"
+  alarm_name          = "${aws_autoscaling_group.ecs_cluster_instance.name}-cpu-scale-out"
   comparison_operator = "GreaterThanThreshold"
   alarm_actions       = [aws_autoscaling_policy.ecs_instance_autoscaling_group_scale_out.arn]
   datapoints_to_alarm = 1
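For readers unfamiliar with `dynamic` blocks: the `dynamic "tag"` block above emits one `tag { ... }` block per entry of `var.tags`, with `tag.key` and `tag.value` bound to each map entry, and `propagate_at_launch = true` copies the tag onto instances the group launches. As a purely illustrative sketch, a hypothetical `tags = { Deployment = "cumulus-std" }` would expand to the equivalent of:

  tag {
    key                 = "Deployment"
    value               = "cumulus-std"
    propagate_at_launch = true
  }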
diff --git a/tf-modules/cumulus/ecs_cluster_instance_autoscaling_cf_template.yml.tmpl b/tf-modules/cumulus/ecs_cluster_instance_autoscaling_cf_template.yml.tmpl
deleted file mode 100644
index e443a608900..00000000000
--- a/tf-modules/cumulus/ecs_cluster_instance_autoscaling_cf_template.yml.tmpl
+++ /dev/null
@@ -1,123 +0,0 @@
-Resources:
-  LaunchConfiguration:
-    Type: AWS::AutoScaling::LaunchConfiguration
-    Properties:
-      SecurityGroups:
-%{ for s in security_group_ids ~}
-        - ${s}
-%{ endfor ~}
-      ImageId: ${image_id}
-      InstanceType: ${instance_type}
-      IamInstanceProfile: ${instance_profile}
-      BlockDeviceMappings:
-        - DeviceName: "/dev/xvdcz"
-          Ebs:
-            DeleteOnTermination: true
-            Encrypted: true
-            VolumeSize: ${docker_volume_size}
-%{ if key_name != null ~}
-      KeyName: ${key_name}
-%{ endif ~}
-      UserData:
-        Fn::Base64: |
-          Content-Type: multipart/mixed; boundary="==BOUNDARY=="
-          MIME-Version: 1.0
-
-          --==BOUNDARY==
-          Content-Type: text/cloud-boothook; charset="us-ascii"
-
-          #!/bin/bash
-
-          if ! rpm -q lvm2 >/dev/null 2>&1; then
-            yum install -y lvm2
-          fi
-
-          vgcreate docker /dev/xvdcz
-
-          lvcreate -n docker-data -L${docker_volume_create_size}G docker
-
-          mkfs.xfs /dev/docker/docker-data
-          mkdir /docker-data
-          mount /dev/docker/docker-data /docker-data
-
-          sed -i '/^\s*DOCKER_STORAGE_OPTIONS=/d' /etc/sysconfig/docker-storage
-          echo 'DOCKER_STORAGE_OPTIONS="--storage-driver overlay2"' >> /etc/sysconfig/docker-storage
-
-          sed -i '/^\s*OPTIONS=/d' /etc/sysconfig/docker
-          echo 'OPTIONS="--default-ulimit nofile=1024:4096 --data-root=/docker-data"' >> /etc/sysconfig/docker
-
-%{ if include_docker_cleanup_cronjob == true ~}
-          echo '* * * * * sudo sh -c "docker ps -q | xargs docker inspect --format='\{{.State.Pid}}' | xargs -IZ fstrim /proc/Z/root/"' | crontab -
-%{ endif ~}
-
-          --==BOUNDARY==
-          Content-Type: text/x-shellscript; charset="us-ascii"
-
-          #!/bin/bash
-%{ if efs_dns_name != null && efs_mount_point != null ~}
-          TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
-          AZ=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone)
-
-          if ! rpm -q nfs-utils >/dev/null 2>&1; then
-            yum install -y nfs-utils
-          fi
-
-          mkdir -p ${efs_mount_point}
-          mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ${efs_dns_name}:/ ${efs_mount_point}
-          chmod 777 ${efs_mount_point}
-
-          service docker restart
-
-%{ endif ~}
-          cat <<'EOF' >> /etc/ecs/ecs.config
-          ECS_CLUSTER=${cluster_name}
-          ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=1m
-          ECS_CONTAINER_STOP_TIMEOUT=${container_stop_timeout}
-          EOF
-
-%{ if docker_hub_config != null ~}
-          echo ECS_ENGINE_AUTH_TYPE=docker >> /etc/ecs/ecs.config
-          echo 'ECS_ENGINE_AUTH_DATA={"https://index.docker.io/v1/":{"username":"${docker_hub_config.username}","password": "${docker_hub_config.password}","email":"${docker_hub_config.email}"}}' >> /etc/ecs/ecs.config
-
-%{ endif ~}
-          if ! which aws >/dev/null 2>&1; then
-            yum install -y jq unzip
-
-            curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
-            unzip awscliv2.zip
-            ./aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli --update
-            rm -rf ./aws awscliv2.zip
-          fi
-
-          aws s3 cp s3://${task_reaper_object.bucket}/${task_reaper_object.key} /usr/local/bin/task-reaper.sh
-          chmod +x /usr/local/bin/task-reaper.sh
-          cat <<'EOF' >> /etc/cron.d/task-reaper
-          PATH=/bin:/usr/local/bin
-          AWS_DEFAULT_REGION=${region}
-          LIFECYCLE_HOOK_NAME=${cluster_name}-ecs-termination-hook
-          * * * * * root /usr/local/bin/task-reaper.sh >> /var/log/task-reaper.log 2>&1
-          EOF
-
-          --==BOUNDARY==--
-
-  AutoScalingGroup:
-    Type: AWS::AutoScaling::AutoScalingGroup
-    UpdatePolicy:
-      AutoScalingRollingUpdate:
-        MinInstancesInService: ${min_size}
-    Properties:
-      VPCZoneIdentifier:
-%{ for s in subnet_ids ~}
-        - ${s}
-%{ endfor ~}
-      LaunchConfigurationName: !Ref LaunchConfiguration
-      MinSize: ${min_size}
-      DesiredCapacity: ${desired_capacity}
-      MaxSize: ${max_size}
-      Tags:
-        - Key: Name
-          Value: ${cluster_name}
-          PropagateAtLaunch: true
-Outputs:
-  AutoscalingGroupName:
-    Value: !Ref AutoScalingGroup
diff --git a/tf-modules/cumulus/ecs_cluster_instance_autoscaling_user_data.tmpl b/tf-modules/cumulus/ecs_cluster_instance_autoscaling_user_data.tmpl
new file mode 100644
index 00000000000..ef3e0eaa7b7
--- /dev/null
+++ b/tf-modules/cumulus/ecs_cluster_instance_autoscaling_user_data.tmpl
@@ -0,0 +1,79 @@
+Content-Type: multipart/mixed; boundary="==BOUNDARY=="
+MIME-Version: 1.0
+
+--==BOUNDARY==
+Content-Type: text/cloud-boothook; charset="us-ascii"
+
+#!/bin/bash
+
+if ! rpm -q lvm2 >/dev/null 2>&1; then
+  yum install -y lvm2
+fi
+
+vgcreate docker /dev/xvdcz
+
+lvcreate -n docker-data -L${docker_volume_create_size}G docker
+
+mkfs.xfs /dev/docker/docker-data
+mkdir /docker-data
+mount /dev/docker/docker-data /docker-data
+
+sed -i '/^\s*DOCKER_STORAGE_OPTIONS=/d' /etc/sysconfig/docker-storage
+echo 'DOCKER_STORAGE_OPTIONS="--storage-driver overlay2"' >> /etc/sysconfig/docker-storage
+
+sed -i '/^\s*OPTIONS=/d' /etc/sysconfig/docker
+echo 'OPTIONS="--default-ulimit nofile=1024:4096 --data-root=/docker-data"' >> /etc/sysconfig/docker
+
+%{ if include_docker_cleanup_cronjob == true ~}
+  echo '* * * * * sudo sh -c "docker ps -q | xargs docker inspect --format='\{{.State.Pid}}' | xargs -IZ fstrim /proc/Z/root/"' | crontab -
+%{ endif ~}
+
+--==BOUNDARY==
+Content-Type: text/x-shellscript; charset="us-ascii"
+
+#!/bin/bash
+%{ if efs_dns_name != null && efs_mount_point != null ~}
+  TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+  AZ=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone)
+
+  if ! rpm -q nfs-utils >/dev/null 2>&1; then
+    yum install -y nfs-utils
+  fi
+
+  mkdir -p ${efs_mount_point}
+  mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ${efs_dns_name}:/ ${efs_mount_point}
+  chmod 777 ${efs_mount_point}
+
+  service docker restart
+
+%{ endif ~}
+cat <<'EOF' >> /etc/ecs/ecs.config
+ECS_CLUSTER=${cluster_name}
+ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=1m
+ECS_CONTAINER_STOP_TIMEOUT=${container_stop_timeout}
+EOF
+
+%{ if docker_hub_config != null ~}
+  echo ECS_ENGINE_AUTH_TYPE=docker >> /etc/ecs/ecs.config
+  echo 'ECS_ENGINE_AUTH_DATA={"https://index.docker.io/v1/":{"username":"${docker_hub_config.username}","password": "${docker_hub_config.password}","email":"${docker_hub_config.email}"}}' >> /etc/ecs/ecs.config
+
+%{ endif ~}
+if ! which aws >/dev/null 2>&1; then
+  yum install -y jq unzip
+
+  curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+  unzip awscliv2.zip
+  ./aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli --update
+  rm -rf ./aws awscliv2.zip
+fi
+
+aws s3 cp s3://${task_reaper_object.bucket}/${task_reaper_object.key} /usr/local/bin/task-reaper.sh
+chmod +x /usr/local/bin/task-reaper.sh
+cat <<'EOF' >> /etc/cron.d/task-reaper
+PATH=/bin:/usr/local/bin
+AWS_DEFAULT_REGION=${region}
+LIFECYCLE_HOOK_NAME=${cluster_name}-ecs-termination-hook
+* * * * * root /usr/local/bin/task-reaper.sh >> /var/log/task-reaper.log 2>&1
+EOF
+
+--==BOUNDARY==--
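The `${...}` interpolations and `%{ if ... ~}` directives in this file are Terraform string-template syntax; the `templatefile()` call in `aws_launch_template.ecs_cluster_instance` above renders the file with `local.ecs_instance_autoscaling_user_data_config` supplying `cluster_name`, `docker_hub_config`, and the rest. A minimal, self-contained sketch of the same mechanism using hypothetical values (nothing below comes from this change):

locals {
  cluster_name      = "example-cluster" # hypothetical value
  docker_hub_config = null              # hypothetical value

  # The %{ if } section is omitted from the rendered string when
  # docker_hub_config is null, mirroring the Docker Hub auth block above.
  ecs_config_snippet = <<-EOT
    ECS_CLUSTER=${local.cluster_name}
    %{ if local.docker_hub_config != null ~}
    ECS_ENGINE_AUTH_TYPE=docker
    %{ endif ~}
  EOT
}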