Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarm for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

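The SQS alarms in the Terraform file below publish to an `aws_sns_topic.alerts` topic that is defined elsewhere in the module. As a rough sketch of the email alerting described in Story 3.6 (the `var.alert_email` variable name is an assumption, not taken from the repository), the topic and its subscription could look like this:

# Sketch only: SNS topic referenced by the alarms below. Apart from the
# "alerts" resource name (which the alarms reference), the topic name and
# var.alert_email are assumptions.
resource "aws_sns_topic" "alerts" {
  name = "${local.name_prefix}-alerts"
  tags = local.common_tags
}

# Email subscriptions stay in "pending confirmation" until the recipient
# confirms them, matching the manual confirmation workflow noted above.
resource "aws_sns_topic_subscription" "alerts_email" {
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email
}
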
# SQS Queue for meteor event processing
resource "aws_sqs_queue" "meteor_processing" {
  name                       = "${local.name_prefix}-processing"
  visibility_timeout_seconds = var.sqs_visibility_timeout_seconds
  message_retention_seconds  = var.sqs_message_retention_seconds
  receive_wait_time_seconds  = 20 # Enable long polling

  # Dead letter queue configuration
  redrive_policy = jsonencode({
    deadLetterTargetArn = aws_sqs_queue.meteor_processing_dlq.arn
    maxReceiveCount     = var.sqs_max_receive_count
  })

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-processing"
    Description = "Queue for processing meteor events"
  })
}

# Dead Letter Queue for failed messages
resource "aws_sqs_queue" "meteor_processing_dlq" {
  name                      = "${local.name_prefix}-processing-dlq"
  message_retention_seconds = var.sqs_message_retention_seconds

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-processing-dlq"
    Description = "Dead letter queue for failed meteor event processing"
  })
}

# SQS Queue policy to allow S3 to send messages
resource "aws_sqs_queue_policy" "meteor_processing_s3" {
  queue_url = aws_sqs_queue.meteor_processing.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "AllowS3ToSendMessage"
        Effect = "Allow"
        Principal = {
          Service = "s3.amazonaws.com"
        }
        Action   = "sqs:SendMessage"
        Resource = aws_sqs_queue.meteor_processing.arn
        Condition = {
          ArnEquals = {
            "aws:SourceArn" = aws_s3_bucket.meteor_events.arn
          }
        }
      }
    ]
  })
}

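# --- Sketch, not part of this file ---------------------------------------
# The policy above only grants S3 permission to send messages; the bucket
# still needs a notification configuration pointing at the queue. A possible
# pairing (the event types and the explicit dependency are assumptions; the
# bucket reference matches the policy condition above):
resource "aws_s3_bucket_notification" "meteor_events" {
  bucket = aws_s3_bucket.meteor_events.id

  queue {
    queue_arn = aws_sqs_queue.meteor_processing.arn
    events    = ["s3:ObjectCreated:*"]
  }

  # Ensure the queue policy exists before S3 validates the destination
  depends_on = [aws_sqs_queue_policy.meteor_processing_s3]
}
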
# CloudWatch Alarms for SQS monitoring
resource "aws_cloudwatch_metric_alarm" "sqs_message_age" {
  alarm_name          = "${local.name_prefix}-sqs-message-age"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "ApproximateAgeOfOldestMessage"
  namespace           = "AWS/SQS"
  period              = "300"
  statistic           = "Maximum"
  threshold           = "900" # 15 minutes
  alarm_description   = "This metric monitors message age in SQS queue"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing.name
  }

  tags = local.common_tags
}

resource "aws_cloudwatch_metric_alarm" "sqs_dlq_messages" {
|
|
alarm_name = "${local.name_prefix}-sqs-dlq-messages"
|
|
comparison_operator = "GreaterThanThreshold"
|
|
evaluation_periods = "1"
|
|
metric_name = "ApproximateNumberOfVisibleMessages"
|
|
namespace = "AWS/SQS"
|
|
period = "300"
|
|
statistic = "Sum"
|
|
threshold = "0"
|
|
alarm_description = "This metric monitors messages in dead letter queue"
|
|
alarm_actions = [aws_sns_topic.alerts.arn]
|
|
|
|
dimensions = {
|
|
QueueName = aws_sqs_queue.meteor_processing_dlq.name
|
|
}
|
|
|
|
tags = local.common_tags
|
|
} |
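The SQS queue depth alarm called out in Story 3.6 (>1000 messages) is not part of this file. A sketch consistent with the two alarms above, with only the threshold taken from the summary (the resource name, alarm name, and description are assumptions), might look like this:

# Hypothetical sketch: queue depth alarm described in Story 3.6. Metric,
# namespace, and dimensions mirror the alarms defined in the file above;
# the 1000-message threshold comes from the commit summary.
resource "aws_cloudwatch_metric_alarm" "sqs_queue_depth" {
  alarm_name          = "${local.name_prefix}-sqs-queue-depth"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = "300"
  statistic           = "Maximum"
  threshold           = "1000"
  alarm_description   = "Backlog in the processing queue exceeds 1000 messages; check Go consumer health and scaling"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing.name
  }

  tags = local.common_tags
}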