grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach (latency, traffic, errors, saturation)
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00

135 lines
4.4 KiB
HCL

# Bucket name, taken from the `id` attribute of the meteor events bucket.
output "s3_bucket_name" {
  description = "Name of the S3 bucket for meteor events"
  value       = aws_s3_bucket.meteor_events.id
}
# ARN of the same meteor events bucket, e.g. for use in IAM policy documents.
output "s3_bucket_arn" {
  description = "ARN of the S3 bucket for meteor events"
  value       = aws_s3_bucket.meteor_events.arn
}
# URL of the main processing queue.
output "sqs_queue_url" {
  description = "URL of the SQS queue for processing"
  value       = aws_sqs_queue.meteor_processing.url
}
# ARN of the main processing queue.
output "sqs_queue_arn" {
  description = "ARN of the SQS queue for processing"
  value       = aws_sqs_queue.meteor_processing.arn
}
# URL of the dead letter queue that accompanies the processing queue.
output "sqs_dlq_url" {
  description = "URL of the SQS dead letter queue"
  value       = aws_sqs_queue.meteor_processing_dlq.url
}
# Console deep link to the dashboard. The region is interpolated twice
# (hostname and `region` query parameter), followed by the dashboard name
# in the URL fragment.
output "cloudwatch_dashboard_url" {
  description = "URL to access the CloudWatch dashboard"
  value       = "https://${var.aws_region}.console.aws.amazon.com/cloudwatch/home?region=${var.aws_region}#dashboards:name=${aws_cloudwatch_dashboard.meteor_dashboard.dashboard_name}"
}
# Map of service key -> CloudWatch log group name.
output "cloudwatch_log_groups" {
  description = "CloudWatch log groups created"

  value = {
    web_backend     = aws_cloudwatch_log_group.web_backend.name
    compute_service = aws_cloudwatch_log_group.compute_service.name
  }
}
# Alerting outputs
# ARN of the SNS topic named `alerts` in this configuration.
output "sns_alerts_topic_arn" {
  description = "ARN of the SNS topic for alerts"
  value       = aws_sns_topic.alerts.arn
}
# Map of short alarm key -> CloudWatch alarm name for the three critical alarms.
output "critical_alarms" {
  description = "Critical CloudWatch alarms created"

  value = {
    nestjs_error_rate  = aws_cloudwatch_metric_alarm.nestjs_5xx_error_rate.alarm_name
    go_service_failure = aws_cloudwatch_metric_alarm.go_service_failure_rate.alarm_name
    sqs_queue_depth    = aws_cloudwatch_metric_alarm.sqs_queue_depth.alarm_name
  }
}
# RDS outputs (when enabled)
# Endpoint of the first RDS instance, or null when var.enable_rds is false.
# Marked sensitive, so Terraform redacts it in CLI output.
# NOTE(review): the AWS provider documents `endpoint` as "address:port";
# confirm consumers do not append the port a second time.
output "rds_endpoint" {
  description = "RDS instance endpoint"
  sensitive   = true

  value = (
    var.enable_rds
    ? aws_db_instance.meteor[0].endpoint
    : null
  )
}
# Database name of the first RDS instance, or null when var.enable_rds is false.
output "rds_database_name" {
  description = "RDS database name"

  value = (
    var.enable_rds
    ? aws_db_instance.meteor[0].db_name
    : null
  )
}
# ARN (not the secret value) of the Secrets Manager secret for the RDS
# password; null when var.enable_rds is false.
output "rds_secret_arn" {
  description = "ARN of the secret containing RDS credentials"

  value = (
    var.enable_rds
    ? aws_secretsmanager_secret.rds_password[0].arn
    : null
  )
}
# IAM outputs
# ARN of the `meteor_app` IAM policy; emitted unconditionally.
output "iam_policy_arn" {
  description = "ARN of the IAM policy for application services"
  value       = aws_iam_policy.meteor_app.arn
}
# ARN of the ECS task role, or null when var.enable_fargate is false.
output "ecs_task_role_arn" {
  description = "ARN of the ECS task role (when using Fargate)"

  value = (
    var.enable_fargate
    ? aws_iam_role.ecs_task[0].arn
    : null
  )
}
# ARN of the ECS task execution role, or null when var.enable_fargate is false.
output "ecs_execution_role_arn" {
  description = "ARN of the ECS execution role (when using Fargate)"

  value = (
    var.enable_fargate
    ? aws_iam_role.ecs_task_execution[0].arn
    : null
  )
}
# Inverse of the Fargate outputs: the secret ARN is emitted only when
# var.enable_fargate is false, and is null otherwise. Marked sensitive.
output "app_credentials_secret_arn" {
  description = "ARN of the secret containing app credentials (when not using Fargate)"
  sensitive   = true

  value = (
    var.enable_fargate
    ? null
    : aws_secretsmanager_secret.app_credentials[0].arn
  )
}
# VPC outputs (when using Fargate)
# ID of the `main` VPC, or null when var.enable_fargate is false.
output "vpc_id" {
  description = "ID of the VPC"

  value = (
    var.enable_fargate
    ? aws_vpc.main[0].id
    : null
  )
}
# IDs of every private subnet (splat over all instances), or null (not an
# empty list) when var.enable_fargate is false.
output "private_subnet_ids" {
  description = "IDs of the private subnets"

  value = (
    var.enable_fargate
    ? aws_subnet.private[*].id
    : null
  )
}
# IDs of every public subnet (splat over all instances), or null (not an
# empty list) when var.enable_fargate is false.
output "public_subnet_ids" {
  description = "IDs of the public subnets"

  value = (
    var.enable_fargate
    ? aws_subnet.public[*].id
    : null
  )
}
# ID of the ECS tasks security group, or null when var.enable_fargate is false.
output "security_group_ecs_tasks" {
  description = "ID of the security group for ECS tasks"

  value = (
    var.enable_fargate
    ? aws_security_group.ecs_tasks[0].id
    : null
  )
}
# Environment configuration for applications
# Map of environment variable name -> value built from the region, bucket,
# queue URL and environment name. Contains no credentials and is not marked
# sensitive.
output "environment_variables" {
  description = "Environment variables for application configuration"

  value = {
    AWS_REGION         = var.aws_region
    AWS_S3_BUCKET_NAME = aws_s3_bucket.meteor_events.id
    AWS_SQS_QUEUE_URL  = aws_sqs_queue.meteor_processing.url
    ENVIRONMENT        = var.environment
  }
}
# Configuration snippet for docker-compose or deployment
# Environment map for container deployments. The whole output is marked
# sensitive because DATABASE_URL embeds the generated RDS password.
#
# DATABASE_URL is null when var.enable_rds is false. When enabled it is built
# as postgresql://<user>:<password>@<host>:<port>/<db>:
#   - the host comes from `address` (hostname only). The provider's `endpoint`
#     attribute is already "address:port", so combining it with `port`
#     produced an invalid "host:port:port" authority.
#   - the password is passed through urlencode() so that reserved characters
#     (e.g. "@", ":", "#", "?", "%") cannot break URL parsing.
output "docker_environment" {
  description = "Environment variables formatted for Docker deployment"
  sensitive   = true

  value = {
    AWS_REGION         = var.aws_region
    AWS_S3_BUCKET_NAME = aws_s3_bucket.meteor_events.id
    AWS_SQS_QUEUE_URL  = aws_sqs_queue.meteor_processing.url

    DATABASE_URL = (
      var.enable_rds
      ? "postgresql://${aws_db_instance.meteor[0].username}:${urlencode(random_password.rds_password[0].result)}@${aws_db_instance.meteor[0].address}:${aws_db_instance.meteor[0].port}/${aws_db_instance.meteor[0].db_name}"
      : null
    )
  }
}