Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
142 lines
4.0 KiB
HCL
142 lines
4.0 KiB
HCL
# DB subnet group listing the private subnets the RDS instance may be placed in.
# Only created when var.enable_rds is true.
resource "aws_db_subnet_group" "meteor" {
  count = var.enable_rds ? 1 : 0

  name = "${local.name_prefix}-db-subnet-group"
  tags = merge(local.common_tags, { Name = "${local.name_prefix}-db-subnet-group" })

  # Exactly the first two private subnets, addressed by index.
  subnet_ids = [for idx in [0, 1] : aws_subnet.private[idx].id]
}
|
|
|
|
# Security group for the RDS PostgreSQL instance.
#   Inbound:  TCP 5432 only, and only from the ECS tasks security group.
#   Outbound: unrestricted.
# Only created when var.enable_rds is true.
resource "aws_security_group" "rds" {
  count = var.enable_rds ? 1 : 0

  vpc_id      = aws_vpc.main.id
  name        = "${local.name_prefix}-rds"
  description = "Security group for RDS PostgreSQL instance"

  ingress {
    description     = "PostgreSQL from ECS tasks"
    protocol        = "tcp"
    from_port       = 5432
    to_port         = 5432
    security_groups = [aws_security_group.ecs_tasks.id]
  }

  egress {
    description = "All outbound traffic"
    protocol    = "-1" # every protocol; the port range is 0-0 in that case
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(local.common_tags, { Name = "${local.name_prefix}-rds" })
}
|
|
|
|
# RDS PostgreSQL instance for the meteor application.
# Only created when var.enable_rds is true. Production ("prod") gets longer
# backup retention, Performance Insights, deletion protection and a final
# snapshot on destroy; every other environment is treated as disposable.
resource "aws_db_instance" "meteor" {
  count = var.enable_rds ? 1 : 0

  identifier = "${local.name_prefix}-postgres"

  # Engine settings
  engine = "postgres"
  # Major-version prefix rather than a pinned minor (was "15.4"): with
  # auto_minor_version_upgrade enabled below, AWS moves the instance to newer
  # 15.x releases, and a fully pinned minor would then no longer match the
  # running version. The provider accepts a version prefix for this case.
  engine_version = "15"
  instance_class = var.rds_instance_class

  # Storage settings
  allocated_storage     = var.rds_allocated_storage
  max_allocated_storage = var.rds_max_allocated_storage # upper bound for storage autoscaling
  storage_type          = "gp3"
  storage_encrypted     = true

  # Database settings
  db_name  = "meteor_${var.environment}"
  username = "meteor_user"
  password = random_password.rds_password[0].result

  # Network settings
  db_subnet_group_name   = aws_db_subnet_group.meteor[0].name
  vpc_security_group_ids = [aws_security_group.rds[0].id]
  publicly_accessible    = false

  # Backup settings
  backup_retention_period    = var.environment == "prod" ? 30 : 7
  backup_window              = "03:00-04:00"
  maintenance_window         = "sun:04:00-sun:05:00"
  auto_minor_version_upgrade = true

  # Monitoring: enhanced monitoring at 60s granularity when enabled, otherwise off (0).
  monitoring_interval = var.enable_detailed_monitoring ? 60 : 0
  monitoring_role_arn = var.enable_detailed_monitoring ? aws_iam_role.rds_enhanced_monitoring[0].arn : null

  # Performance Insights
  performance_insights_enabled = var.environment == "prod"

  # Deletion protection
  deletion_protection = var.environment == "prod"
  skip_final_snapshot = var.environment != "prod"
  # A final snapshot is taken in prod (skip_final_snapshot = false), and the
  # provider requires an identifier for it; without one, destroying the prod
  # instance fails. Left null where the snapshot is skipped.
  # NOTE(review): the name is static, so an older final snapshot with the same
  # name must be removed before the instance can be destroyed a second time.
  final_snapshot_identifier = var.environment == "prod" ? "${local.name_prefix}-postgres-final" : null

  tags = merge(local.common_tags, {
    Name = "${local.name_prefix}-postgres"
  })
}
|
|
|
|
# Random 32-character master password for the RDS instance.
# Only created when var.enable_rds is true.
resource "random_password" "rds_password" {
  count   = var.enable_rds ? 1 : 0
  length  = 32
  special = true

  # RDS rejects master passwords containing '/', '@', '"' or a space. The
  # provider's default special-character set includes '@', so some generated
  # passwords would make instance creation fail. This is the default set with
  # '@' removed.
  # NOTE(review): changing this argument on an already-applied stack is expected
  # to regenerate the password (and therefore rotate the DB master password).
  override_special = "!#$%&*()-_=+[]{}<>:?"
}
|
|
|
|
# Secrets Manager secret that holds the database credentials. This resource is
# only the container; the value is written by
# aws_secretsmanager_secret_version.rds_password.
# Only created when var.enable_rds is true.
resource "aws_secretsmanager_secret" "rds_password" {
  count = var.enable_rds ? 1 : 0

  description = "RDS PostgreSQL password for meteor application"
  name        = "${local.name_prefix}-rds-password"
  tags        = local.common_tags
}
|
|
|
|
# Secret value: a JSON document with everything needed to connect to the
# database (dbname, endpoint, password, port, username).
# Only created when var.enable_rds is true.
resource "aws_secretsmanager_secret_version" "rds_password" {
  count     = var.enable_rds ? 1 : 0
  secret_id = aws_secretsmanager_secret.rds_password[0].id

  # Connection details are read back from the instance; the password comes
  # straight from the generator.
  # NOTE(review): per the AWS provider docs, `endpoint` is in "address:port"
  # form, so consumers should not append `port` to it again — confirm usage.
  secret_string = jsonencode({
    dbname   = aws_db_instance.meteor[0].db_name
    endpoint = aws_db_instance.meteor[0].endpoint
    port     = aws_db_instance.meteor[0].port
    username = aws_db_instance.meteor[0].username
    password = random_password.rds_password[0].result
  })
}
|
|
|
|
# IAM role that the RDS enhanced-monitoring service assumes.
# Only created when both var.enable_rds and var.enable_detailed_monitoring are true.
resource "aws_iam_role" "rds_enhanced_monitoring" {
  count = var.enable_rds && var.enable_detailed_monitoring ? 1 : 0

  name = "${local.name_prefix}-rds-enhanced-monitoring"
  tags = local.common_tags

  # Trust policy: only monitoring.rds.amazonaws.com may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "monitoring.rds.amazonaws.com" }
      }
    ]
  })
}
|
|
|
|
# Attaches the AWS-managed AmazonRDSEnhancedMonitoringRole policy to the
# enhanced-monitoring role. Created under the same condition as that role:
# both var.enable_rds and var.enable_detailed_monitoring must be true.
resource "aws_iam_role_policy_attachment" "rds_enhanced_monitoring" {
  count = var.enable_rds && var.enable_detailed_monitoring ? 1 : 0

  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole"
  role       = aws_iam_role.rds_enhanced_monitoring[0].name
}