Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
51 lines
1.3 KiB
HCL
51 lines
1.3 KiB
HCL
# SNS topic for system monitoring alerts.
# The topic name and its Name tag both follow "<project>-<environment>-alerts".
resource "aws_sns_topic" "alerts" {
  name = format("%s-%s-alerts", var.project_name, var.environment)

  tags = {
    Name        = format("%s-%s-alerts", var.project_name, var.environment)
    Environment = var.environment
    Project     = var.project_name
    Purpose     = "System monitoring alerts"
  }
}

# Resource policy for the alerts topic.
# Grants the CloudWatch service principal SNS:Publish on this topic only, and
# only when the request's aws:SourceAccount equals the account Terraform is
# running in (read from data.aws_caller_identity.current).
resource "aws_sns_topic_policy" "alerts_policy" {
  arn = aws_sns_topic.alerts.arn

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Sid       = "AllowCloudWatchAlarmsToPublish"
      Effect    = "Allow"
      Principal = { Service = "cloudwatch.amazonaws.com" }
      Action    = ["SNS:Publish"]
      Resource  = aws_sns_topic.alerts.arn
      Condition = {
        StringEquals = {
          "aws:SourceAccount" = data.aws_caller_identity.current.account_id
        }
      }
    }]
  })
}

# Optional email subscription to the alerts topic.
#
# Zero instances are created when var.alert_email is null or empty; otherwise a
# single "email" subscription is created for that address. The subscription
# requires manual confirmation by the recipient before alerts are delivered.
resource "aws_sns_topic_subscription" "email_alerts" {
  # Check null as well as "": `null != ""` evaluates to true, which would
  # otherwise request a subscription with no endpoint and fail the plan.
  count = var.alert_email != null && var.alert_email != "" ? 1 : 0

  # The reference to the topic ARN already makes Terraform create the topic
  # first, so no explicit depends_on is required.
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email
}

# Looks up the identity of the credentials Terraform is running with.
# Its account_id is read by the alerts topic policy's aws:SourceAccount
# condition.
data "aws_caller_identity" "current" {}