# CloudWatch Dashboard for Meteor Application Monitoring
resource "aws_cloudwatch_dashboard" "meteor_dashboard" {
  dashboard_name = "${local.name_prefix}-monitoring-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Row 1: Application Overview
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6
        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestCount", { stat = "Sum" }],
            [".", "ErrorCount", { stat = "Sum" }],
            ["MeteorApp/ComputeService", "MessageProcessingCount", { stat = "Sum" }],
            [".", "MessageProcessingError", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Request and Processing Volume"
          period  = 300
          yAxis   = { left = { min = 0 } }
        }
      },
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6
        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestDuration", { stat = "Average" }],
            [".", "RequestDuration", { stat = "p95" }],
            ["MeteorApp/ComputeService", "MessageProcessingDuration", { stat = "Average" }],
            [".", "MessageProcessingDuration", { stat = "p95" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Response Time and Processing Latency"
          period  = 300
          yAxis   = { left = { min = 0 } }
        }
      },
      # Row 2: Error Rates and Success Metrics
      {
        type   = "metric"
        x      = 0
        y      = 6
        width  = 8
        height = 6
        properties = {
          metrics = [
            [{ expression = "m1/m2*100", label = "Web Backend Error Rate %" }],
            [{ expression = "m3/m4*100", label = "Compute Service Error Rate %" }],
            ["MeteorApp/WebBackend", "ErrorCount", { id = "m1", visible = false }],
            [".", "RequestCount", { id = "m2", visible = false }],
            ["MeteorApp/ComputeService", "MessageProcessingError", { id = "m3", visible = false }],
            [".", "MessageProcessingCount", { id = "m4", visible = false }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Error Rates"
          period  = 300
          yAxis   = { left = { min = 0, max = 100 } }
        }
      },
      {
        type   = "metric"
        x      = 8
        y      = 6
        width  = 8
        height = 6
        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "AuthOperationCount", "Success", "true"],
            [".", "PaymentOperationCount", "Success", "true"],
            ["MeteorApp/ComputeService", "ValidationSuccess"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Successful Operations"
          period  = 300
          yAxis   = { left = { min = 0 } }
        }
      },
      {
        type   = "metric"
        x      = 16
        y      = 6
        width  = 8
        height = 6
        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "EventsProcessed", { stat = "Sum" }],
            [".", "ValidationCount", { stat = "Sum" }],
            ["MeteorApp/WebBackend", "EventProcessingCount", { stat = "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Event Processing Volume"
          period  = 300
          yAxis   = { left = { min = 0 } }
        }
      },
      # Row 3: Infrastructure Metrics
      {
        type   = "metric"
        x      = 0
        y      = 12
        width  = 8
        height = 6
        properties = {
          metrics = concat(
            var.enable_rds ?
[ ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", "${local.name_prefix}-postgres"], [".", "DatabaseConnections", "DBInstanceIdentifier", "${local.name_prefix}-postgres"] ] : [], [ # Add external database metrics if available ] ) view = "timeSeries" stacked = false region = var.aws_region title = "Database Performance" period = 300 yAxis = { left = { min = 0 } } } }, { type = "metric" x = 8 y = 12 width = 8 height = 6 properties = { metrics = [ ["AWS/SQS", "ApproximateNumberOfVisibleMessages", "QueueName", aws_sqs_queue.meteor_processing.name], [".", "ApproximateAgeOfOldestMessage", "QueueName", aws_sqs_queue.meteor_processing.name], [".", "ApproximateNumberOfVisibleMessages", "QueueName", aws_sqs_queue.meteor_processing_dlq.name] ] view = "timeSeries" stacked = false region = var.aws_region title = "SQS Queue Metrics" period = 300 yAxis = { left = { min = 0 } } } }, { type = "metric" x = 16 y = 12 width = 8 height = 6 properties = { metrics = concat( var.enable_fargate ? [ ["AWS/ECS", "CPUUtilization", "ServiceName", "${local.name_prefix}-web-backend"], [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-web-backend"], [".", "CPUUtilization", "ServiceName", "${local.name_prefix}-compute-service"], [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-compute-service"] ] : [], [ # Placeholder for external container metrics ] ) view = "timeSeries" stacked = false region = var.aws_region title = "Container Resource Utilization" period = 300 yAxis = { left = { min = 0 max = 100 } } } }, # Row 4: Business Metrics { type = "metric" x = 0 y = 18 width = 12 height = 6 properties = { metrics = [ ["MeteorApp/ComputeService", "ValidationDuration", "ProviderName", "classic_cv", { "stat": "Average" }], [".", "ValidationDuration", "ProviderName", "mvp", { "stat": "Average" }], [".", "ValidationCount", "ProviderName", "classic_cv"], [".", "ValidationCount", "ProviderName", "mvp"] ] view = "timeSeries" stacked = false region = var.aws_region title = "Validation Provider Performance" period = 300 yAxis = { left = { min = 0 } } } }, { type = "metric" x = 12 y = 18 width = 12 height = 6 properties = { metrics = [ ["MeteorApp/ComputeService", "DatabaseOperationDuration", "Operation", "CreateValidatedEvent"], [".", "DatabaseOperationDuration", "Operation", "GetRawEventByID"], [".", "DatabaseOperationCount", "Operation", "CreateValidatedEvent"], [".", "DatabaseOperationCount", "Operation", "GetRawEventByID"] ] view = "timeSeries" stacked = false region = var.aws_region title = "Database Operation Performance" period = 300 yAxis = { left = { min = 0 } } } }, # Row 5: Custom Metrics and Alerts { type = "metric" x = 0 y = 24 width = 8 height = 6 properties = { metrics = [ ["AWS/S3", "BucketSizeBytes", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "StandardStorage"], [".", "NumberOfObjects", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "AllStorageTypes"] ] view = "timeSeries" stacked = false region = var.aws_region title = "S3 Storage Metrics" period = 86400 # Daily yAxis = { left = { min = 0 } } } }, { type = "log" x = 8 y = 24 width = 16 height = 6 properties = { query = "SOURCE '/aws/lambda/${local.name_prefix}' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20" region = var.aws_region title = "Recent Error Logs" view = "table" } } ] }) tags = merge(local.common_tags, { Name = "${local.name_prefix}-dashboard" Description = "Comprehensive monitoring dashboard for Meteor application" }) } # CloudWatch 
# CloudWatch Log Groups
resource "aws_cloudwatch_log_group" "web_backend" {
  name              = "/aws/ecs/${local.name_prefix}-web-backend"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-web-backend-logs"
    Description = "Log group for web backend service"
  })
}

resource "aws_cloudwatch_log_group" "compute_service" {
  name              = "/aws/ecs/${local.name_prefix}-compute-service"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-compute-service-logs"
    Description = "Log group for compute service"
  })
}

# CloudWatch Alarms for Critical System Health

# Alarm for NestJS 5xx Error Rate (>1% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "nestjs_5xx_error_rate" {
  alarm_name          = "${local.name_prefix}-nestjs-5xx-error-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "ErrorCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "RequestCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  # NOTE: CloudWatch alarms cannot evaluate SEARCH expressions in metric math, so the
  # aggregate ErrorCount metric (e1) is used as the error-rate numerator here.
  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "5xx Error Rate %"
    return_data = true
  }

  threshold         = var.nestjs_error_rate_threshold
  alarm_description = "CRITICAL: NestJS 5xx error rate exceeds ${var.nestjs_error_rate_threshold}% over 5 minutes. This indicates server errors that require immediate investigation. Check application logs and recent deployments."
  alarm_actions     = [aws_sns_topic.alerts.arn]
  ok_actions        = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-nestjs-5xx-error-rate"
    Severity = "Critical"
    Service  = "WebBackend"
  })
}

# Alarm for Go Service Processing Failure Rate (>5% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "go_service_failure_rate" {
  alarm_name          = "${local.name_prefix}-go-service-failure-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "MessageProcessingError"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "MessageProcessingCount"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "Processing Failure Rate %"
    return_data = true
  }

  threshold         = var.go_service_failure_rate_threshold
  alarm_description = "CRITICAL: Go compute service processing failure rate exceeds ${var.go_service_failure_rate_threshold}% over 5 minutes. This indicates message processing issues. Check service logs, SQS dead letter queue, and validation providers."
  alarm_actions = [aws_sns_topic.alerts.arn]
  ok_actions    = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-go-service-failure-rate"
    Severity = "Critical"
    Service  = "ComputeService"
  })
}

# Alarm for SQS Queue Depth (>1000 visible messages)
resource "aws_cloudwatch_metric_alarm" "sqs_queue_depth" {
  alarm_name          = "${local.name_prefix}-sqs-queue-depth"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = var.alarm_period_seconds
  statistic           = "Average"
  threshold           = var.sqs_queue_depth_threshold
  treat_missing_data  = "notBreaching"
  alarm_description   = "CRITICAL: SQS queue depth exceeds ${var.sqs_queue_depth_threshold} messages. This indicates message processing backlog. Check compute service health, scaling, and processing capacity."
  alarm_actions       = [aws_sns_topic.alerts.arn]
  ok_actions          = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-queue-depth"
    Severity = "Critical"
    Service  = "SQS"
  })
}
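
# The dashboard above charts the dead-letter queue, and the failure-rate alarm's
# runbook points operators at it, so an alarm on DLQ depth is a natural companion.
# This is a minimal sketch, not part of the original configuration: it assumes any
# message landing in aws_sqs_queue.meteor_processing_dlq warrants an alert, and it
# reuses the existing alerts topic and alarm period variable.
resource "aws_cloudwatch_metric_alarm" "sqs_dlq_messages" {
  alarm_name          = "${local.name_prefix}-sqs-dlq-messages"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = var.alarm_period_seconds
  statistic           = "Maximum"
  threshold           = 1 # assumption: a single dead-lettered message is actionable
  treat_missing_data  = "notBreaching"
  alarm_description   = "CRITICAL: Messages are present in the processing dead letter queue. Inspect the failed messages and the compute service logs."
  alarm_actions       = [aws_sns_topic.alerts.arn]
  ok_actions          = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing_dlq.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-dlq-messages"
    Severity = "Critical"
    Service  = "SQS"
  })
}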