Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUMULUS-3759: Migrated ECS Autoscaling group from launch configurations to launch templates #3880

Merged
merged 16 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- **CUMULUS-3759**
- Migrated `tf-modules/cumulus/ecs_cluster` ECS Autoscaling group from launch configurations to launch templates
- **CUMULUS-3955**
- Removed `VACUUM` statements from db migrations. In cases where the PG database is very large, these queries
can take a long time and exceed the Lambda timeout, causing failures on deployment.
Expand Down
5 changes: 5 additions & 0 deletions example/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ cumulus-sit:
apiUsername: jasmine
pdrNodeNameProviderBucket: cumulus-sit-pdr-node-name-provider

cumulus-std:
bucket: cumulus-sit-internal
apiUsername: jasmine
pdrNodeNameProviderBucket: cumulus-sit-pdr-node-name-provider

cumulus-es:
bucket: cumulus-sit-internal
apiUsername: jasmine
Expand Down
2 changes: 1 addition & 1 deletion example/cumulus-tf/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ variable "data_persistence_remote_state_config" {
}

variable "s3_replicator_config" {
type = object({ source_bucket = string, source_prefix = string, target_bucket = string, target_prefix = string, target_region = string })
type = object({ source_bucket = string, source_prefix = string, target_bucket = string, target_prefix = string, target_region = optional(string) })
jennyhliu marked this conversation as resolved.
Show resolved Hide resolved
default = null
description = "Configuration for the s3-replicator module. Items with prefix of source_prefix in the source_bucket will be replicated to the target_bucket with target_prefix."
}
Expand Down
65 changes: 65 additions & 0 deletions example/deployments/cumulus/cumulus-std.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Terraform variable values for the "cumulus-std" deployment
# (example/deployments/cumulus/cumulus-std.tfvars).
prefix = "cumulus-std"

# Bucket map: each key maps to a bucket name and a bucket type.
# Every entry here points at a "cumulus-sit-*" bucket rather than a
# prefix-specific one.
buckets = {
internal = {
name = "cumulus-sit-internal"
type = "internal"
},
private = {
name = "cumulus-sit-private"
type = "private"
},
protected = {
name = "cumulus-sit-protected"
type = "protected"
},
public = {
name = "cumulus-sit-public"
type = "public"
},
protected-2 = {
name = "cumulus-sit-protected-2"
type = "protected"
},
glacier = {
name = "cumulus-sit-orca-glacier"
type = "orca"
},
dashboard = {
name = "cumulus-sit-dashboard"
type = "dashboard"
}
}

key_name = "lp"

# Authentication: Launchpad is the OAuth provider, with the SAML endpoints
# below pointing at the SIT dashboard/API and the Launchpad sandbox IdP.
oauth_provider = "launchpad"

saml_entity_id = "https://dashboard.cumulus.sit.earthdata.nasa.gov"
saml_assertion_consumer_service = "https://api.cumulus.sit.earthdata.nasa.gov/saml/auth"
saml_idp_login = "https://auth.launchpad-sbx.nasa.gov/affwebservices/public/saml2sso"
saml_launchpad_metadata_url = "https://auth.launchpad-sbx.nasa.gov/unauth/metadata/launchpad-sbx.idp.xml"

deploy_cumulus_distribution = false

archive_api_url = "https://api.cumulus.sit.earthdata.nasa.gov/"
private_archive_api_gateway = true

# LOG CONFIGURATION (optional)
log_api_gateway_to_cloudwatch = true

tea_distribution_url = "https://data.cumulus.sit.earthdata.nasa.gov"

# s3-replicator settings: objects under source_prefix in source_bucket are
# replicated to target_bucket under target_prefix. target_region is not set
# here; the variable's object type declares it as optional(string).
s3_replicator_config = {
source_bucket = "cumulus-std-access-logs"
source_prefix = "s3_access_logs"
target_bucket = "esdis-metrics-inbound-sit-cumulus-std-distribution"
target_prefix = "input/s3_access/cumulus-stdsit"
}

api_reserved_concurrency = 14

# Per-task Lambda timeout overrides.
# NOTE(review): values are presumably seconds - confirm against the module variable.
lambda_timeouts = {
queue_granules_task_timeout: 900,
discover_granules_task_timeout: 900
}
10 changes: 10 additions & 0 deletions example/deployments/data-persistence/cumulus-std.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Terraform variable values for the data-persistence module of the
# "cumulus-std" deployment (example/deployments/data-persistence/cumulus-std.tfvars).
prefix = "cumulus-std"

# Elasticsearch domain settings: two t2.small.elasticsearch instances on
# Elasticsearch 5.3 with gp2 storage.
# NOTE(review): volume_size is presumably in GiB - confirm against the module variable.
elasticsearch_config = {
domain_name = "es"
instance_count = 2
instance_type = "t2.small.elasticsearch"
version = "5.3"
volume_type = "gp2"
volume_size = 10
}
30 changes: 17 additions & 13 deletions example/fake-provider-cf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,21 @@ Resources:
ToPort: 0
VpcId: !Ref VpcId

LaunchConfiguration:
Type: AWS::AutoScaling::LaunchConfiguration
LaunchTemplate:
Type: AWS::EC2::LaunchTemplate
Properties:
AssociatePublicIpAddress: false
IamInstanceProfile: !Ref InstanceProfile
ImageId: !Ref LatestAmiId
InstanceMonitoring: false
InstanceType: t3.small
SecurityGroups:
- !Ref SecurityGroup
UserData:
Fn::Base64:
Fn::Sub: |
LaunchTemplateName: "fake-provider-launch-template"
Nnaga1 marked this conversation as resolved.
Show resolved Hide resolved
LaunchTemplateData:
IamInstanceProfile:
Arn: !GetAtt InstanceProfile.Arn
ImageId: !Ref LatestAmiId
Monitoring:
Enabled: false
InstanceType: t3.small
SecurityGroupIds:
- !Ref SecurityGroup
UserData:
Fn::Base64: !Sub |
#!/bin/bash -ex

TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
Expand Down Expand Up @@ -280,7 +282,9 @@ Resources:
MinInstancesInService: 0
DependsOn: S3ProviderBucket
Properties:
LaunchConfigurationName: !Ref LaunchConfiguration
LaunchTemplate:
LaunchTemplateId: !Ref LaunchTemplate
Version: !GetAtt LaunchTemplate.LatestVersionNumber
MinSize: "1"
DesiredCapacity: "1"
MaxSize: "1"
Expand Down
109 changes: 80 additions & 29 deletions tf-modules/cumulus/ecs_cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -220,45 +220,96 @@ data "aws_efs_mount_target" "ecs_cluster_instance" {
}

locals {
ecs_instance_autoscaling_cf_template_config = {
ecs_instance_autoscaling_user_data_config = {
cluster_name = aws_ecs_cluster.default.name
container_stop_timeout = var.ecs_container_stop_timeout,
docker_hub_config = var.ecs_docker_hub_config,
docker_volume_size = var.ecs_cluster_instance_docker_volume_size,
docker_volume_create_size = var.ecs_cluster_instance_docker_volume_size - 1,
efs_dns_name = var.ecs_efs_config == null ? null : data.aws_efs_mount_target.ecs_cluster_instance[0].dns_name,
efs_mount_point = var.ecs_efs_config == null ? null : var.ecs_efs_config.mount_point,
image_id = var.ecs_cluster_instance_image_id,
include_docker_cleanup_cronjob = var.ecs_include_docker_cleanup_cronjob,
instance_profile = aws_iam_instance_profile.ecs_cluster_instance.arn,
instance_type = var.ecs_cluster_instance_type,
key_name = var.key_name,
min_size = var.ecs_cluster_min_size,
desired_capacity = var.ecs_cluster_desired_size,
max_size = var.ecs_cluster_max_size,
region = data.aws_region.current.name
security_group_ids = compact(concat(
[
aws_security_group.ecs_cluster_instance.id,
var.elasticsearch_security_group_id,
var.rds_security_group
],
var.ecs_custom_sg_ids
))
subnet_ids = var.ecs_cluster_instance_subnet_ids,
task_reaper_object = aws_s3_bucket_object.task_reaper
}

security_group_ids = compact(concat(
[
jennyhliu marked this conversation as resolved.
Show resolved Hide resolved
aws_security_group.ecs_cluster_instance.id,
var.elasticsearch_security_group_id,
var.rds_security_group
],
var.ecs_custom_sg_ids
))
}

# Launch template for the ECS cluster's EC2 instances. The key pair, AMI and
# instance type come from module variables; security groups come from
# local.security_group_ids.
resource "aws_launch_template" "ecs_cluster_instance" {
name_prefix = "${var.prefix}_ecs_cluster_template"
key_name = var.key_name
image_id = var.ecs_cluster_instance_image_id
instance_type = var.ecs_cluster_instance_type
vpc_security_group_ids = local.security_group_ids
# Encrypted EBS volume attached at /dev/xvdcz, sized by
# var.ecs_cluster_instance_docker_volume_size and deleted with the instance.
# NOTE(review): presumably the Docker storage volume, given the variable name - confirm.
block_device_mappings {
Nnaga1 marked this conversation as resolved.
Show resolved Hide resolved
device_name = "/dev/xvdcz"
ebs {
delete_on_termination = true
encrypted = true
volume_size = var.ecs_cluster_instance_docker_volume_size
}
}

iam_instance_profile {
arn = aws_iam_instance_profile.ecs_cluster_instance.arn
}
monitoring {
enabled = true
}

# User data is rendered from the module's user-data template with
# local.ecs_instance_autoscaling_user_data_config, then base64-encoded.
user_data = base64encode(templatefile(
jennyhliu marked this conversation as resolved.
Show resolved Hide resolved
"${path.module}/ecs_cluster_instance_autoscaling_user_data.tmpl",
local.ecs_instance_autoscaling_user_data_config
))
}

resource "aws_cloudformation_stack" "ecs_instance_autoscaling_group" {
name = "${aws_ecs_cluster.default.name}-autoscaling-group"
template_body = templatefile("${path.module}/ecs_cluster_instance_autoscaling_cf_template.yml.tmpl", local.ecs_instance_autoscaling_cf_template_config)
tags = var.tags
# Auto Scaling group for the ECS cluster's EC2 instances, launched from
# aws_launch_template.ecs_cluster_instance into the configured subnets.
# NOTE(review): desired_capacity is set here while the step-scaling policies
# in this file also target this group - confirm whether later applies should
# reset capacity to var.ecs_cluster_desired_size.
resource "aws_autoscaling_group" "ecs_cluster_instance" {
name_prefix = aws_ecs_cluster.default.name
desired_capacity = var.ecs_cluster_desired_size
max_size = var.ecs_cluster_max_size
min_size = var.ecs_cluster_min_size
vpc_zone_identifier = var.ecs_cluster_instance_subnet_ids

# Rolling instance refresh that keeps at least 50% of instances healthy.
instance_refresh {
jennyhliu marked this conversation as resolved.
Show resolved Hide resolved
strategy = "Rolling"
preferences {
min_healthy_percentage = 50
}
}
# The version tracks the launch template's latest_version attribute.
launch_template {
id = aws_launch_template.ecs_cluster_instance.id
version = aws_launch_template.ecs_cluster_instance.latest_version
}
lifecycle {
create_before_destroy = true
}

# Name tag set to the ECS cluster name and propagated to launched instances.
tag {
key = "Name"
value = aws_ecs_cluster.default.name
propagate_at_launch = true
}

# One tag block per entry in var.tags, each propagated to launched instances.
dynamic "tag" {
for_each = var.tags
content {
key = tag.key
propagate_at_launch = true
value = tag.value
}
}
}

resource "aws_autoscaling_lifecycle_hook" "ecs_instance_termination_hook" {
name = "${aws_ecs_cluster.default.name}-ecs-termination-hook"
autoscaling_group_name = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
autoscaling_group_name = aws_autoscaling_group.ecs_cluster_instance.name
default_result = "CONTINUE"
heartbeat_timeout = 150
lifecycle_transition = "autoscaling:EC2_INSTANCE_TERMINATING"
Expand All @@ -267,8 +318,8 @@ resource "aws_autoscaling_lifecycle_hook" "ecs_instance_termination_hook" {
# Scale in config

resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_in" {
name = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-scale-in"
autoscaling_group_name = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
name = "${aws_autoscaling_group.ecs_cluster_instance.name}-scale-in"
autoscaling_group_name = aws_autoscaling_group.ecs_cluster_instance.name
adjustment_type = "PercentChangeInCapacity"
metric_aggregation_type = "Average"
policy_type = "StepScaling"
Expand All @@ -280,7 +331,7 @@ resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_in" {
}

resource "aws_cloudwatch_metric_alarm" "ecs_instance_autoscaling_group_cpu_scale_in_alarm" {
alarm_name = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-cpu-scale-in"
alarm_name = "${aws_autoscaling_group.ecs_cluster_instance.name}-cpu-scale-in"
comparison_operator = "LessThanThreshold"
alarm_actions = [aws_autoscaling_policy.ecs_instance_autoscaling_group_scale_in.arn]
datapoints_to_alarm = 1
Expand All @@ -298,8 +349,8 @@ resource "aws_cloudwatch_metric_alarm" "ecs_instance_autoscaling_group_cpu_scale
# Scale out config

resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_out" {
name = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-scale-out"
autoscaling_group_name = aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName
name = "${aws_autoscaling_group.ecs_cluster_instance.name}-scale-out"
autoscaling_group_name = aws_autoscaling_group.ecs_cluster_instance.name
adjustment_type = "PercentChangeInCapacity"
metric_aggregation_type = "Average"
policy_type = "StepScaling"
Expand All @@ -312,7 +363,7 @@ resource "aws_autoscaling_policy" "ecs_instance_autoscaling_group_scale_out" {
}

resource "aws_cloudwatch_metric_alarm" "ecs_instance_autoscaling_group_cpu_scale_out_alarm" {
alarm_name = "${aws_cloudformation_stack.ecs_instance_autoscaling_group.outputs.AutoscalingGroupName}-cpu-scale-out"
alarm_name = "${aws_autoscaling_group.ecs_cluster_instance.name}-cpu-scale-out"
comparison_operator = "GreaterThanThreshold"
alarm_actions = [aws_autoscaling_policy.ecs_instance_autoscaling_group_scale_out.arn]
datapoints_to_alarm = 1
Expand Down
Loading