grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00

194 lines
5.2 KiB
HCL

# ECS task execution role (Fargate).
# Only created when var.enable_fargate is true.
resource "aws_iam_role" "ecs_task_execution" {
  count = var.enable_fargate ? 1 : 0

  name = "${local.name_prefix}-ecs-task-execution"
  tags = local.common_tags

  # Trust policy: allows the ecs-tasks.amazonaws.com service principal to
  # assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "ecs-tasks.amazonaws.com" }
    }]
  })
}
# Bind the AWS-managed AmazonECSTaskExecutionRolePolicy to the execution role.
# Shares the same var.enable_fargate gate as the role it attaches to.
resource "aws_iam_role_policy_attachment" "ecs_task_execution" {
  count = var.enable_fargate ? 1 : 0

  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_task_execution[count.index].name
}
# ECS task role: carries the application's own permissions.
# Only created when var.enable_fargate is true.
resource "aws_iam_role" "ecs_task" {
  count = var.enable_fargate ? 1 : 0

  name = "${local.name_prefix}-ecs-task"
  tags = local.common_tags

  # Trust policy: allows the ecs-tasks.amazonaws.com service principal to
  # assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "ecs-tasks.amazonaws.com" }
    }]
  })
}
# IAM policy for application services.
#
# Grants:
#   - object read/write/delete and listing on the meteor_events S3 bucket
#   - send/receive/delete and attribute lookups on the processing queue and
#     its dead-letter queue
#   - CloudWatch metric publishing and CloudWatch Logs writes
#   - (only when var.enable_rds is true) read access to the RDS password secret
#
# The Secrets Manager statement is appended conditionally rather than emitted
# with an empty Resource list: IAM rejects a statement that has no resources,
# which would make this policy fail to create whenever RDS is disabled.
resource "aws_iam_policy" "meteor_app" {
  name        = "${local.name_prefix}-app-policy"
  description = "IAM policy for Meteor application services"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = concat(
      [
        # S3 permissions for event storage
        {
          Effect = "Allow"
          Action = [
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket"
          ]
          # Bucket ARN covers ListBucket; the /* form covers object actions.
          Resource = [
            aws_s3_bucket.meteor_events.arn,
            "${aws_s3_bucket.meteor_events.arn}/*"
          ]
        },
        # SQS permissions for message processing
        {
          Effect = "Allow"
          Action = [
            "sqs:ReceiveMessage",
            "sqs:DeleteMessage",
            "sqs:SendMessage",
            "sqs:GetQueueAttributes",
            "sqs:GetQueueUrl"
          ]
          Resource = [
            aws_sqs_queue.meteor_processing.arn,
            aws_sqs_queue.meteor_processing_dlq.arn
          ]
        },
        # CloudWatch permissions for metrics and logs
        {
          Effect = "Allow"
          Action = [
            "cloudwatch:PutMetricData",
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents",
            "logs:DescribeLogStreams"
          ]
          Resource = "*"
        }
      ],
      # Secrets Manager permissions: present only when RDS is enabled, so the
      # policy never contains a statement with an empty Resource list.
      var.enable_rds ? [
        {
          Effect = "Allow"
          Action = [
            "secretsmanager:GetSecretValue"
          ]
          Resource = [aws_secretsmanager_secret.rds_password[0].arn]
        }
      ] : []
    )
  })

  tags = local.common_tags
}
# Give the ECS task role the application policy.
# Exists only alongside the task role (var.enable_fargate is true).
resource "aws_iam_role_policy_attachment" "ecs_task_app_policy" {
  count = var.enable_fargate ? 1 : 0

  policy_arn = aws_iam_policy.meteor_app.arn
  role       = aws_iam_role.ecs_task[count.index].name
}
# IAM user for the application services.
# Created only when Fargate is NOT enabled (inverse of the ECS role gate).
resource "aws_iam_user" "meteor_app" {
  count = var.enable_fargate ? 0 : 1

  name = "${local.name_prefix}-app-user"
  path = "/"

  # Common tags plus a Name/Description pair specific to this user.
  tags = merge(
    local.common_tags,
    {
      Name        = "${local.name_prefix}-app-user"
      Description = "IAM user for Meteor application services"
    }
  )
}
# Give the application IAM user the application policy.
# Exists only when the user does (var.enable_fargate is false).
resource "aws_iam_user_policy_attachment" "meteor_app" {
  count = var.enable_fargate ? 0 : 1

  policy_arn = aws_iam_policy.meteor_app.arn
  user       = aws_iam_user.meteor_app[count.index].name
}
# Access key pair for the application IAM user (non-Fargate deployments only).
resource "aws_iam_access_key" "meteor_app" {
  count = var.enable_fargate ? 0 : 1

  user = aws_iam_user.meteor_app[count.index].name
}
# Secrets Manager container for the application's AWS credentials
# (non-Fargate deployments only). The value is written by the matching
# aws_secretsmanager_secret_version resource.
resource "aws_secretsmanager_secret" "app_credentials" {
  count = var.enable_fargate ? 0 : 1

  name        = "${local.name_prefix}-app-credentials"
  description = "AWS credentials for Meteor application"

  tags = local.common_tags
}
# Secret payload: a JSON document with the IAM user's access key ID, secret
# access key, and the configured AWS region (non-Fargate deployments only).
resource "aws_secretsmanager_secret_version" "app_credentials" {
  count = var.enable_fargate ? 0 : 1

  secret_id = aws_secretsmanager_secret.app_credentials[count.index].id

  secret_string = jsonencode({
    access_key_id     = aws_iam_access_key.meteor_app[count.index].id
    secret_access_key = aws_iam_access_key.meteor_app[count.index].secret
    region            = var.aws_region
  })
}
# Execution role for Lambda functions (reserved for future use).
# Unlike the ECS roles, this one is created unconditionally.
resource "aws_iam_role" "lambda_execution" {
  name = "${local.name_prefix}-lambda-execution"
  tags = local.common_tags

  # Trust policy: allows the lambda.amazonaws.com service principal to
  # assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Action    = "sts:AssumeRole"
      Principal = { Service = "lambda.amazonaws.com" }
    }]
  })
}
# Bind the AWS-managed AWSLambdaBasicExecutionRole policy to the Lambda role.
resource "aws_iam_role_policy_attachment" "lambda_basic" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.lambda_execution.name
}
# Also give the Lambda role the shared application policy, so functions get
# the same resource permissions as the application services.
resource "aws_iam_role_policy_attachment" "lambda_app_policy" {
  policy_arn = aws_iam_policy.meteor_app.arn
  role       = aws_iam_role.lambda_execution.name
}