grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: Core Business Metrics Monitoring**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design (see the IAM sketch after this list)
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking
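
For reference, both instrumented services need permission to publish custom metrics. Below is a minimal sketch of the kind of IAM policy attached to their task roles; the data source label, role reference, and policy name are illustrative and not necessarily the exact resources in this repository.

```hcl
# Illustrative only: lets a service's task role publish custom metrics
# into the MeteorApp/* namespaces that the dashboard widgets read from.
data "aws_iam_policy_document" "put_metrics" {
  statement {
    effect    = "Allow"
    actions   = ["cloudwatch:PutMetricData"]
    resources = ["*"] # PutMetricData does not support resource-level permissions

    condition {
      test     = "StringLike"
      variable = "cloudwatch:namespace"
      values   = ["MeteorApp/*"]
    }
  }
}

resource "aws_iam_role_policy" "web_backend_metrics" {
  name   = "put-metric-data"           # hypothetical name
  role   = aws_iam_role.web_backend.id # assumed task role, defined elsewhere
  policy = data.aws_iam_policy_document.put_metrics.json
}
```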

**Story 3.6: Critical System Alerts**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with a manual subscription-confirmation workflow (see the SNS sketch below)
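
A minimal sketch of the SNS side of the alerting setup is shown below. The `alerts` resource label matches what the alarm resources reference (`aws_sns_topic.alerts.arn`), but the topic name suffix and the `alert_email_addresses` variable are assumptions, not confirmed names from this repository.

```hcl
# SNS topic that the CloudWatch alarms publish to (referenced as aws_sns_topic.alerts).
resource "aws_sns_topic" "alerts" {
  name = "${local.name_prefix}-critical-alerts" # illustrative name
  tags = local.common_tags
}

# One email subscription per recipient; var.alert_email_addresses is assumed here.
resource "aws_sns_topic_subscription" "alert_emails" {
  for_each  = toset(var.alert_email_addresses)
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = each.value
}
```

Each address receives a confirmation email, and the subscription stays in `PendingConfirmation` (no alarm notifications delivered) until the recipient confirms it, which is the manual step noted above.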

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach (latency, traffic, errors, saturation)
- Configurable alarm thresholds and a deployment-ready monitoring solution (see the variables sketch below)
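
The thresholds and evaluation windows used by the alarm resources are plain Terraform variables. A sketch is below; the variable names match those referenced in the alarms, while the defaults are inferred from the thresholds stated above (1%, 5%, 1000 messages, 5-minute periods) and may differ from the actual defaults in the repository.

```hcl
variable "nestjs_error_rate_threshold" {
  description = "5xx error rate (%) that triggers the web backend alarm"
  type        = number
  default     = 1
}

variable "go_service_failure_rate_threshold" {
  description = "Message processing failure rate (%) for the compute service alarm"
  type        = number
  default     = 5
}

variable "sqs_queue_depth_threshold" {
  description = "Visible messages in the processing queue before alerting"
  type        = number
  default     = 1000
}

variable "alarm_evaluation_periods" {
  description = "Consecutive breaching periods before the alarm fires (assumed default)"
  type        = number
  default     = 1
}

variable "alarm_period_seconds" {
  description = "Length of each evaluation period in seconds"
  type        = number
  default     = 300
}
```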

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00


# CloudWatch Dashboard for Meteor Application Monitoring
resource "aws_cloudwatch_dashboard" "meteor_dashboard" {
dashboard_name = "${local.name_prefix}-monitoring-dashboard"
dashboard_body = jsonencode({
widgets = [
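# Widgets are laid out on CloudWatch's 24-column grid via x/y/width/height; each row below is 6 units tall.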
# Row 1: Application Overview
{
type = "metric"
x = 0
y = 0
width = 12
height = 6
properties = {
metrics = [
["MeteorApp/WebBackend", "RequestCount", { "stat": "Sum" }],
[".", "ErrorCount", { "stat": "Sum" }],
["MeteorApp/ComputeService", "MessageProcessingCount", { "stat": "Sum" }],
[".", "MessageProcessingError", { "stat": "Sum" }]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Request and Processing Volume"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "metric"
x = 12
y = 0
width = 12
height = 6
properties = {
metrics = [
["MeteorApp/WebBackend", "RequestDuration", { "stat": "Average" }],
[".", "RequestDuration", { "stat": "p95" }],
["MeteorApp/ComputeService", "MessageProcessingDuration", { "stat": "Average" }],
[".", "MessageProcessingDuration", { "stat": "p95" }]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Response Time and Processing Latency"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
# Row 2: Error Rates and Success Metrics
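# The error-rate widget derives percentages via metric math: the hidden series m1-m4 ("visible": false) feed the two expressions.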
{
type = "metric"
x = 0
y = 6
width = 8
height = 6
properties = {
metrics = [
[{ "expression": "m1/m2*100", "label": "Web Backend Error Rate %" }],
[{ "expression": "m3/m4*100", "label": "Compute Service Error Rate %" }],
["MeteorApp/WebBackend", "ErrorCount", { "id": "m1", "visible": false }],
[".", "RequestCount", { "id": "m2", "visible": false }],
["MeteorApp/ComputeService", "MessageProcessingError", { "id": "m3", "visible": false }],
[".", "MessageProcessingCount", { "id": "m4", "visible": false }]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Error Rates"
period = 300
yAxis = {
left = {
min = 0
max = 100
}
}
}
},
{
type = "metric"
x = 8
y = 6
width = 8
height = 6
properties = {
metrics = [
["MeteorApp/WebBackend", "AuthOperationCount", "Success", "true"],
[".", "PaymentOperationCount", "Success", "true"],
["MeteorApp/ComputeService", "ValidationSuccess"]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Successful Operations"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "metric"
x = 16
y = 6
width = 8
height = 6
properties = {
metrics = [
["MeteorApp/ComputeService", "EventsProcessed", { "stat": "Sum" }],
[".", "ValidationCount", { "stat": "Sum" }],
["MeteorApp/WebBackend", "EventProcessingCount", { "stat": "Sum" }]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Event Processing Volume"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
# Row 3: Infrastructure Metrics
{
type = "metric"
x = 0
y = 12
width = 8
height = 6
properties = {
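# RDS metrics are included only when var.enable_rds is true; concat() with an empty fallback list keeps the widget valid otherwise.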
metrics = concat(
var.enable_rds ? [
["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", "${local.name_prefix}-postgres"],
[".", "DatabaseConnections", "DBInstanceIdentifier", "${local.name_prefix}-postgres"]
] : [],
[
# Add external database metrics if available
]
)
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Database Performance"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "metric"
x = 8
y = 12
width = 8
height = 6
properties = {
metrics = [
["AWS/SQS", "ApproximateNumberOfVisibleMessages", "QueueName", aws_sqs_queue.meteor_processing.name],
[".", "ApproximateAgeOfOldestMessage", "QueueName", aws_sqs_queue.meteor_processing.name],
[".", "ApproximateNumberOfVisibleMessages", "QueueName", aws_sqs_queue.meteor_processing_dlq.name]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "SQS Queue Metrics"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "metric"
x = 16
y = 12
width = 8
height = 6
properties = {
metrics = concat(
var.enable_fargate ? [
["AWS/ECS", "CPUUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
[".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
[".", "CPUUtilization", "ServiceName", "${local.name_prefix}-compute-service"],
[".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-compute-service"]
] : [],
[
# Placeholder for external container metrics
]
)
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Container Resource Utilization"
period = 300
yAxis = {
left = {
min = 0
max = 100
}
}
}
},
# Row 4: Business Metrics
{
type = "metric"
x = 0
y = 18
width = 12
height = 6
properties = {
metrics = [
["MeteorApp/ComputeService", "ValidationDuration", "ProviderName", "classic_cv", { "stat": "Average" }],
[".", "ValidationDuration", "ProviderName", "mvp", { "stat": "Average" }],
[".", "ValidationCount", "ProviderName", "classic_cv"],
[".", "ValidationCount", "ProviderName", "mvp"]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Validation Provider Performance"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "metric"
x = 12
y = 18
width = 12
height = 6
properties = {
metrics = [
["MeteorApp/ComputeService", "DatabaseOperationDuration", "Operation", "CreateValidatedEvent"],
[".", "DatabaseOperationDuration", "Operation", "GetRawEventByID"],
[".", "DatabaseOperationCount", "Operation", "CreateValidatedEvent"],
[".", "DatabaseOperationCount", "Operation", "GetRawEventByID"]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "Database Operation Performance"
period = 300
yAxis = {
left = {
min = 0
}
}
}
},
# Row 5: Custom Metrics and Alerts
{
type = "metric"
x = 0
y = 24
width = 8
height = 6
properties = {
metrics = [
["AWS/S3", "BucketSizeBytes", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "StandardStorage"],
[".", "NumberOfObjects", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "AllStorageTypes"]
]
view = "timeSeries"
stacked = false
region = var.aws_region
title = "S3 Storage Metrics"
period = 86400 # Daily
yAxis = {
left = {
min = 0
}
}
}
},
{
type = "log"
x = 8
y = 24
width = 16
height = 6
properties = {
query = "SOURCE '/aws/lambda/${local.name_prefix}' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20"
region = var.aws_region
title = "Recent Error Logs"
view = "table"
}
}
]
})
# Note: aws_cloudwatch_dashboard does not accept a tags argument, so no tags are set on the dashboard itself.
}
# CloudWatch Log Groups
resource "aws_cloudwatch_log_group" "web_backend" {
name = "/aws/ecs/${local.name_prefix}-web-backend"
retention_in_days = var.cloudwatch_log_retention_days
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-web-backend-logs"
Description = "Log group for web backend service"
})
}
resource "aws_cloudwatch_log_group" "compute_service" {
name = "/aws/ecs/${local.name_prefix}-compute-service"
retention_in_days = var.cloudwatch_log_retention_days
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-compute-service-logs"
Description = "Log group for compute service"
})
}
# CloudWatch Alarms for Critical System Health
# Alarm for NestJS 5xx Error Rate (>1% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "nestjs_5xx_error_rate" {
alarm_name = "${local.name_prefix}-nestjs-5xx-error-rate"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = var.alarm_evaluation_periods
treat_missing_data = "notBreaching"
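# Metric math alarm: the Sum queries below (return_data = false) are inputs; only the error-rate expression returns data for evaluation.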
metric_query {
id = "e1"
return_data = false
metric {
metric_name = "ErrorCount"
namespace = "MeteorApp/WebBackend"
period = var.alarm_period_seconds
stat = "Sum"
}
}
metric_query {
id = "e2"
return_data = false
metric {
metric_name = "RequestCount"
namespace = "MeteorApp/WebBackend"
period = var.alarm_period_seconds
stat = "Sum"
}
}
# SEARCH() expressions cannot be evaluated by CloudWatch alarms, so the aggregate ErrorCount metric (e1) is used as the 5xx numerator here.
metric_query {
id = "e3"
expression = "(e1/e2)*100"
label = "5xx Error Rate %"
return_data = true
}
threshold = var.nestjs_error_rate_threshold
alarm_description = "CRITICAL: NestJS 5xx error rate exceeds ${var.nestjs_error_rate_threshold}% over 5 minutes. This indicates server errors that require immediate investigation. Check application logs and recent deployments."
alarm_actions = [aws_sns_topic.alerts.arn]
ok_actions = [aws_sns_topic.alerts.arn]
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-nestjs-5xx-error-rate"
Severity = "Critical"
Service = "WebBackend"
})
}
# Alarm for Go Service Processing Failure Rate (>5% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "go_service_failure_rate" {
alarm_name = "${local.name_prefix}-go-service-failure-rate"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = var.alarm_evaluation_periods
treat_missing_data = "notBreaching"
metric_query {
id = "e1"
return_data = false
metric {
metric_name = "MessageProcessingError"
namespace = "MeteorApp/ComputeService"
period = var.alarm_period_seconds
stat = "Sum"
}
}
metric_query {
id = "e2"
return_data = false
metric {
metric_name = "MessageProcessingCount"
namespace = "MeteorApp/ComputeService"
period = var.alarm_period_seconds
stat = "Sum"
}
}
metric_query {
id = "e3"
expression = "(e1/e2)*100"
label = "Processing Failure Rate %"
return_data = true
}
threshold = var.go_service_failure_rate_threshold
alarm_description = "CRITICAL: Go compute service processing failure rate exceeds ${var.go_service_failure_rate_threshold}% over 5 minutes. This indicates message processing issues. Check service logs, SQS dead letter queue, and validation providers."
alarm_actions = [aws_sns_topic.alerts.arn]
ok_actions = [aws_sns_topic.alerts.arn]
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-go-service-failure-rate"
Severity = "Critical"
Service = "ComputeService"
})
}
# Alarm for SQS Queue Depth (>1000 visible messages)
resource "aws_cloudwatch_metric_alarm" "sqs_queue_depth" {
alarm_name = "${local.name_prefix}-sqs-queue-depth"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = var.alarm_evaluation_periods
metric_name = "ApproximateNumberOfMessagesVisible"
namespace = "AWS/SQS"
period = var.alarm_period_seconds
statistic = "Average"
threshold = var.sqs_queue_depth_threshold
treat_missing_data = "notBreaching"
alarm_description = "CRITICAL: SQS queue depth exceeds ${var.sqs_queue_depth_threshold} messages. This indicates message processing backlog. Check compute service health, scaling, and processing capacity."
alarm_actions = [aws_sns_topic.alerts.arn]
ok_actions = [aws_sns_topic.alerts.arn]
dimensions = {
QueueName = aws_sqs_queue.meteor_processing.name
}
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-sqs-queue-depth"
Severity = "Critical"
Service = "SQS"
})
}