Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
194 lines
5.2 KiB
HCL
194 lines
5.2 KiB
HCL
# ECS task execution role (Fargate).
# Created only when var.enable_fargate is true (count is 1, otherwise 0).
resource "aws_iam_role" "ecs_task_execution" {
  count = var.enable_fargate ? 1 : 0

  name = "${local.name_prefix}-ecs-task-execution"
  tags = local.common_tags

  # Trust policy: allows the ecs-tasks.amazonaws.com service principal to
  # assume this role via sts:AssumeRole.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "ecs-tasks.amazonaws.com" }
        Action    = "sts:AssumeRole"
      },
    ]
  })
}

# Attaches the AWS-managed AmazonECSTaskExecutionRolePolicy to the ECS task
# execution role. Uses the same enable_fargate toggle as the role it targets,
# so the [0] index is only evaluated when that role exists.
resource "aws_iam_role_policy_attachment" "ecs_task_execution" {
  count = var.enable_fargate ? 1 : 0

  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_task_execution[0].name
}

# ECS task role (application permissions).
# Only instantiated when var.enable_fargate is true.
resource "aws_iam_role" "ecs_task" {
  count = var.enable_fargate ? 1 : 0

  name = "${local.name_prefix}-ecs-task"
  tags = local.common_tags

  # Trust relationship: the ECS tasks service principal may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      },
    ]
  })
}

# IAM policy for application services.
#
# Grants:
#   * object read/write/delete and bucket listing on the meteor_events bucket,
#   * receive/delete/send and attribute/URL lookups on the processing queue
#     and its dead-letter queue,
#   * CloudWatch metric publishing and CloudWatch Logs writes,
#   * (only when var.enable_rds is true) read access to the RDS password secret.
#
# The Secrets Manager statement is appended conditionally instead of being
# emitted with an empty Resource list: IAM rejects a statement that names no
# resources, which previously made the whole policy invalid whenever
# var.enable_rds was false.
resource "aws_iam_policy" "meteor_app" {
  name        = "${local.name_prefix}-app-policy"
  description = "IAM policy for Meteor application services"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = concat(
      [
        # S3 permissions for event storage.
        # The bucket ARN is listed for s3:ListBucket; the "/*" ARN covers the
        # object-level actions.
        {
          Effect = "Allow"
          Action = [
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket"
          ]
          Resource = [
            aws_s3_bucket.meteor_events.arn,
            "${aws_s3_bucket.meteor_events.arn}/*"
          ]
        },
        # SQS permissions for message processing (main queue and DLQ)
        {
          Effect = "Allow"
          Action = [
            "sqs:ReceiveMessage",
            "sqs:DeleteMessage",
            "sqs:SendMessage",
            "sqs:GetQueueAttributes",
            "sqs:GetQueueUrl"
          ]
          Resource = [
            aws_sqs_queue.meteor_processing.arn,
            aws_sqs_queue.meteor_processing_dlq.arn
          ]
        },
        # CloudWatch permissions for metrics and logs
        {
          Effect = "Allow"
          Action = [
            "cloudwatch:PutMetricData",
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents",
            "logs:DescribeLogStreams"
          ]
          Resource = "*"
        },
      ],
      # Secrets Manager permissions: included only when RDS is enabled, so the
      # rendered policy never contains a statement with an empty Resource list.
      var.enable_rds ? [
        {
          Effect = "Allow"
          Action = [
            "secretsmanager:GetSecretValue"
          ]
          Resource = [aws_secretsmanager_secret.rds_password[0].arn]
        },
      ] : []
    )
  })

  tags = local.common_tags
}

# Binds the application policy to the ECS task role.
# Exists only when var.enable_fargate is true, matching the role's own count.
resource "aws_iam_role_policy_attachment" "ecs_task_app_policy" {
  count = var.enable_fargate ? 1 : 0

  policy_arn = aws_iam_policy.meteor_app.arn
  role       = aws_iam_role.ecs_task[0].name
}

# IAM user for application services.
# Inverse of the Fargate toggle: created only when var.enable_fargate is false.
resource "aws_iam_user" "meteor_app" {
  count = var.enable_fargate ? 0 : 1

  path = "/"
  name = "${local.name_prefix}-app-user"

  # Common tags plus a Name and Description specific to this user.
  tags = merge(
    local.common_tags,
    {
      Name        = "${local.name_prefix}-app-user"
      Description = "IAM user for Meteor application services"
    },
  )
}

# Attaches the application policy to the application IAM user.
# Present only when var.enable_fargate is false, i.e. when that user exists.
resource "aws_iam_user_policy_attachment" "meteor_app" {
  count = var.enable_fargate ? 0 : 1

  policy_arn = aws_iam_policy.meteor_app.arn
  user       = aws_iam_user.meteor_app[0].name
}

# Access key for the application IAM user.
# Created only when var.enable_fargate is false (same condition as the user).
resource "aws_iam_access_key" "meteor_app" {
  count = var.enable_fargate ? 0 : 1
  user  = aws_iam_user.meteor_app[0].name
}

# Secrets Manager secret that holds the application's AWS credentials.
# Created only when var.enable_fargate is false.
resource "aws_secretsmanager_secret" "app_credentials" {
  count = var.enable_fargate ? 0 : 1

  description = "AWS credentials for Meteor application"
  name        = "${local.name_prefix}-app-credentials"
  tags        = local.common_tags
}

# Secret payload for app_credentials: a JSON document containing the IAM
# user's access key id, its secret access key, and the configured AWS region.
# Created only when var.enable_fargate is false.
resource "aws_secretsmanager_secret_version" "app_credentials" {
  count = var.enable_fargate ? 0 : 1

  secret_id = aws_secretsmanager_secret.app_credentials[0].id

  secret_string = jsonencode({
    region            = var.aws_region
    access_key_id     = aws_iam_access_key.meteor_app[0].id
    secret_access_key = aws_iam_access_key.meteor_app[0].secret
  })
}

# Execution role for Lambda functions (future use).
# Unlike the ECS roles, this role has no count and is always created.
resource "aws_iam_role" "lambda_execution" {
  name = "${local.name_prefix}-lambda-execution"
  tags = local.common_tags

  # Trust policy: allows the lambda.amazonaws.com service principal to assume
  # this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "lambda.amazonaws.com" }
        Action    = "sts:AssumeRole"
      },
    ]
  })
}

# Attaches the AWS-managed AWSLambdaBasicExecutionRole policy to the Lambda
# execution role.
resource "aws_iam_role_policy_attachment" "lambda_basic" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.lambda_execution.name
}

# Attaches the application policy (aws_iam_policy.meteor_app) to the Lambda
# execution role, in addition to the basic execution policy.
resource "aws_iam_role_policy_attachment" "lambda_app_policy" {
  policy_arn = aws_iam_policy.meteor_app.arn
  role       = aws_iam_role.lambda_execution.name
}