Implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across the Epic 3 stories:

**Story 3.5: Core Business Metrics Monitoring**
- Instrumented the NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented the Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with a modular design
- Built a 5-row CloudWatch dashboard covering application, error-rate, business, and infrastructure metrics
- Added error categorization and provider performance tracking

**Story 3.6: Critical System Alerts**
- Implemented SNS-based alerting infrastructure via Terraform
- Created a critical alarm for the NestJS 5xx error rate (>1% threshold)
- Created a Go service processing failure rate alarm (>5% threshold)
- Created an SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with a manual confirmation workflow (see the SNS sketch below)

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and a deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
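The alarms in the file below publish to an `aws_sns_topic.alerts` resource defined elsewhere in the module. A minimal sketch of that topic and its email subscription, assuming a `var.alert_email` variable (the names here are illustrative, not confirmed by this commit):

```hcl
# Sketch (assumed names): the alerts topic referenced by the alarms in this file.
resource "aws_sns_topic" "alerts" {
  name = "${local.name_prefix}-alerts"
  tags = local.common_tags
}

# Email endpoints stay "pending confirmation" until the recipient clicks the link
# AWS sends, which is the manual confirmation workflow mentioned above.
resource "aws_sns_topic_subscription" "alert_email" {
  topic_arn = aws_sns_topic.alerts.arn
  protocol  = "email"
  endpoint  = var.alert_email # assumed variable
}
```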
# CloudWatch Dashboard for Meteor Application Monitoring
resource "aws_cloudwatch_dashboard" "meteor_dashboard" {
  dashboard_name = "${local.name_prefix}-monitoring-dashboard"

  dashboard_body = jsonencode({
    widgets = [
      # Row 1: Application Overview
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestCount", { "stat": "Sum" }],
            [".", "ErrorCount", { "stat": "Sum" }],
            ["MeteorApp/ComputeService", "MessageProcessingCount", { "stat": "Sum" }],
            [".", "MessageProcessingError", { "stat": "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Request and Processing Volume"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 12
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "RequestDuration", { "stat": "Average" }],
            [".", "RequestDuration", { "stat": "p95" }],
            ["MeteorApp/ComputeService", "MessageProcessingDuration", { "stat": "Average" }],
            [".", "MessageProcessingDuration", { "stat": "p95" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Response Time and Processing Latency"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 2: Error Rates and Success Metrics
      {
        type   = "metric"
        x      = 0
        y      = 6
        width  = 8
        height = 6

        properties = {
          # The hidden series m1-m4 exist only to feed the two metric math
          # expressions that render the error-rate lines.
          metrics = [
            [{ "expression": "m1/m2*100", "label": "Web Backend Error Rate %" }],
            [{ "expression": "m3/m4*100", "label": "Compute Service Error Rate %" }],
            ["MeteorApp/WebBackend", "ErrorCount", { "id": "m1", "visible": false }],
            [".", "RequestCount", { "id": "m2", "visible": false }],
            ["MeteorApp/ComputeService", "MessageProcessingError", { "id": "m3", "visible": false }],
            [".", "MessageProcessingCount", { "id": "m4", "visible": false }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Error Rates"
          period  = 300
          yAxis = {
            left = {
              min = 0
              max = 100
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 8
        y      = 6
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/WebBackend", "AuthOperationCount", "Success", "true"],
            [".", "PaymentOperationCount", "Success", "true"],
            ["MeteorApp/ComputeService", "ValidationSuccess"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Successful Operations"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 16
        y      = 6
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "EventsProcessed", { "stat": "Sum" }],
            [".", "ValidationCount", { "stat": "Sum" }],
            ["MeteorApp/WebBackend", "EventProcessingCount", { "stat": "Sum" }]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Event Processing Volume"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 3: Infrastructure Metrics
      {
        type   = "metric"
        x      = 0
        y      = 12
        width  = 8
        height = 6

        properties = {
          metrics = concat(
            var.enable_rds ? [
              ["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", "${local.name_prefix}-postgres"],
              [".", "DatabaseConnections", "DBInstanceIdentifier", "${local.name_prefix}-postgres"]
            ] : [],
            [
              # Add external database metrics if available
            ]
          )
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Database Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 8
        y      = 12
        width  = 8
        height = 6

        properties = {
          # NOTE: the AWS/SQS metric is named ApproximateNumberOfMessagesVisible;
          # the previously used "ApproximateNumberOfVisibleMessages" does not exist.
          metrics = [
            ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", aws_sqs_queue.meteor_processing.name],
            [".", "ApproximateAgeOfOldestMessage", "QueueName", aws_sqs_queue.meteor_processing.name],
            [".", "ApproximateNumberOfMessagesVisible", "QueueName", aws_sqs_queue.meteor_processing_dlq.name]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "SQS Queue Metrics"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 16
        y      = 12
        width  = 8
        height = 6

        properties = {
          metrics = concat(
            var.enable_fargate ? [
              # NOTE: AWS/ECS utilization metrics are published with ClusterName and
              # ServiceName dimensions together; add the ClusterName dimension here
              # once the cluster resource is referenced, or these series stay empty.
              ["AWS/ECS", "CPUUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
              [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-web-backend"],
              [".", "CPUUtilization", "ServiceName", "${local.name_prefix}-compute-service"],
              [".", "MemoryUtilization", "ServiceName", "${local.name_prefix}-compute-service"]
            ] : [],
            [
              # Placeholder for external container metrics
            ]
          )
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Container Resource Utilization"
          period  = 300
          yAxis = {
            left = {
              min = 0
              max = 100
            }
          }
        }
      },

      # Row 4: Business Metrics
      {
        type   = "metric"
        x      = 0
        y      = 18
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "ValidationDuration", "ProviderName", "classic_cv", { "stat": "Average" }],
            [".", "ValidationDuration", "ProviderName", "mvp", { "stat": "Average" }],
            [".", "ValidationCount", "ProviderName", "classic_cv"],
            [".", "ValidationCount", "ProviderName", "mvp"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Validation Provider Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "metric"
        x      = 12
        y      = 18
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["MeteorApp/ComputeService", "DatabaseOperationDuration", "Operation", "CreateValidatedEvent"],
            [".", "DatabaseOperationDuration", "Operation", "GetRawEventByID"],
            [".", "DatabaseOperationCount", "Operation", "CreateValidatedEvent"],
            [".", "DatabaseOperationCount", "Operation", "GetRawEventByID"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Database Operation Performance"
          period  = 300
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },

      # Row 5: Custom Metrics and Alerts
      {
        type   = "metric"
        x      = 0
        y      = 24
        width  = 8
        height = 6

        properties = {
          metrics = [
            ["AWS/S3", "BucketSizeBytes", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "StandardStorage"],
            [".", "NumberOfObjects", "BucketName", aws_s3_bucket.meteor_events.bucket, "StorageType", "AllStorageTypes"]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "S3 Storage Metrics"
          period  = 86400 # Daily
          yAxis = {
            left = {
              min = 0
            }
          }
        }
      },
      {
        type   = "log"
        x      = 8
        y      = 24
        width  = 16
        height = 6

        properties = {
          # Query the ECS log groups defined below; the previous query pointed at a
          # Lambda log group that this stack never creates.
          query  = "SOURCE '/aws/ecs/${local.name_prefix}-web-backend' | SOURCE '/aws/ecs/${local.name_prefix}-compute-service' | fields @timestamp, @message | filter @message like /ERROR/ | sort @timestamp desc | limit 20"
          region = var.aws_region
          title  = "Recent Error Logs"
          view   = "table"
        }
      }
    ]
  })

  # NOTE: aws_cloudwatch_dashboard does not accept a tags argument, so the
  # dashboard itself is left untagged.
}
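
# The custom namespaces used above (MeteorApp/WebBackend, MeteorApp/ComputeService)
# only receive data if the services are allowed to call cloudwatch:PutMetricData.
# A sketch of that permission as a standalone policy document; the data source name
# is an assumption, and attaching it to the actual task roles happens with the rest
# of the IAM definitions in this module.
data "aws_iam_policy_document" "put_custom_metrics" {
  statement {
    sid       = "AllowMeteorCustomMetrics"
    actions   = ["cloudwatch:PutMetricData"]
    resources = ["*"] # PutMetricData does not support resource-level permissions
    condition {
      test     = "StringEquals"
      variable = "cloudwatch:namespace"
      values   = ["MeteorApp/WebBackend", "MeteorApp/ComputeService"]
    }
  }
}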

# CloudWatch Log Groups
resource "aws_cloudwatch_log_group" "web_backend" {
  name              = "/aws/ecs/${local.name_prefix}-web-backend"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-web-backend-logs"
    Description = "Log group for web backend service"
  })
}

resource "aws_cloudwatch_log_group" "compute_service" {
  name              = "/aws/ecs/${local.name_prefix}-compute-service"
  retention_in_days = var.cloudwatch_log_retention_days

  tags = merge(local.common_tags, {
    Name        = "${local.name_prefix}-compute-service-logs"
    Description = "Log group for compute service"
  })
}
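
# The variables referenced in this file are declared elsewhere in the module
# (variables.tf). The following is a reference sketch of those declarations; the
# defaults are inferred from the alarm thresholds described in the commit (1% /
# 5% / 1000 messages over 5-minute periods) and may not match the real defaults.
variable "aws_region" {
  type = string
}

variable "cloudwatch_log_retention_days" {
  type    = number
  default = 30 # assumed
}

variable "alarm_evaluation_periods" {
  type    = number
  default = 1
}

variable "alarm_period_seconds" {
  type    = number
  default = 300 # 5 minutes, per the alarm descriptions
}

variable "nestjs_error_rate_threshold" {
  type    = number
  default = 1 # percent
}

variable "go_service_failure_rate_threshold" {
  type    = number
  default = 5 # percent
}

variable "sqs_queue_depth_threshold" {
  type    = number
  default = 1000 # visible messages
}

variable "enable_rds" {
  type    = bool
  default = false
}

variable "enable_fargate" {
  type    = bool
  default = false
}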

# CloudWatch Alarms for Critical System Health

# Alarm for NestJS 5xx Error Rate (>1% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "nestjs_5xx_error_rate" {
  alarm_name          = "${local.name_prefix}-nestjs-5xx-error-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "ErrorCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "RequestCount"
      namespace   = "MeteorApp/WebBackend"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  # CloudWatch alarms cannot evaluate SEARCH() expressions, so the rate is
  # computed directly from the ErrorCount/RequestCount series above. This assumes
  # ErrorCount tracks 5xx responses; if it also counts 4xx responses, publish a
  # dedicated 5xx metric and reference it in e1 instead.
  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "5xx Error Rate %"
    return_data = true
  }

  threshold         = var.nestjs_error_rate_threshold
  alarm_description = "CRITICAL: NestJS 5xx error rate exceeds ${var.nestjs_error_rate_threshold}% over 5 minutes. This indicates server errors that require immediate investigation. Check application logs and recent deployments."
  alarm_actions     = [aws_sns_topic.alerts.arn]
  ok_actions        = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-nestjs-5xx-error-rate"
    Severity = "Critical"
    Service  = "WebBackend"
  })
}

# Alarm for Go Service Processing Failure Rate (>5% over 5 minutes)
resource "aws_cloudwatch_metric_alarm" "go_service_failure_rate" {
  alarm_name          = "${local.name_prefix}-go-service-failure-rate"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  treat_missing_data  = "notBreaching"

  metric_query {
    id          = "e1"
    return_data = false

    metric {
      metric_name = "MessageProcessingError"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e2"
    return_data = false

    metric {
      metric_name = "MessageProcessingCount"
      namespace   = "MeteorApp/ComputeService"
      period      = var.alarm_period_seconds
      stat        = "Sum"
    }
  }

  metric_query {
    id          = "e3"
    expression  = "(e1/e2)*100"
    label       = "Processing Failure Rate %"
    return_data = true
  }

  threshold         = var.go_service_failure_rate_threshold
  alarm_description = "CRITICAL: Go compute service processing failure rate exceeds ${var.go_service_failure_rate_threshold}% over 5 minutes. This indicates message processing issues. Check service logs, SQS dead letter queue, and validation providers."
  alarm_actions     = [aws_sns_topic.alerts.arn]
  ok_actions        = [aws_sns_topic.alerts.arn]

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-go-service-failure-rate"
    Severity = "Critical"
    Service  = "ComputeService"
  })
}

# Alarm for SQS Queue Depth (>1000 visible messages)
resource "aws_cloudwatch_metric_alarm" "sqs_queue_depth" {
  alarm_name          = "${local.name_prefix}-sqs-queue-depth"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = var.alarm_evaluation_periods
  # NOTE: the AWS/SQS metric is named ApproximateNumberOfMessagesVisible;
  # the previously used "ApproximateNumberOfVisibleMessages" does not exist.
  metric_name        = "ApproximateNumberOfMessagesVisible"
  namespace          = "AWS/SQS"
  period             = var.alarm_period_seconds
  statistic          = "Average"
  threshold          = var.sqs_queue_depth_threshold
  treat_missing_data = "notBreaching"
  alarm_description  = "CRITICAL: SQS queue depth exceeds ${var.sqs_queue_depth_threshold} messages. This indicates message processing backlog. Check compute service health, scaling, and processing capacity."
  alarm_actions      = [aws_sns_topic.alerts.arn]
  ok_actions         = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-queue-depth"
    Severity = "Critical"
    Service  = "SQS"
  })
}
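
# Not part of the original file: the dashboard above already tracks the dead
# letter queue, so a companion alarm on DLQ arrivals is a natural extension.
# Sketch only; the zero threshold ("any DLQ message pages") is an assumption,
# not a project decision.
resource "aws_cloudwatch_metric_alarm" "sqs_dlq_messages" {
  alarm_name          = "${local.name_prefix}-sqs-dlq-messages"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "ApproximateNumberOfMessagesVisible"
  namespace           = "AWS/SQS"
  period              = var.alarm_period_seconds
  statistic           = "Sum"
  threshold           = 0
  treat_missing_data  = "notBreaching"
  alarm_description   = "CRITICAL: Messages have landed in the dead letter queue, meaning processing failed after all retries. Inspect the DLQ payloads and compute service logs."
  alarm_actions       = [aws_sns_topic.alerts.arn]
  ok_actions          = [aws_sns_topic.alerts.arn]

  dimensions = {
    QueueName = aws_sqs_queue.meteor_processing_dlq.name
  }

  tags = merge(local.common_tags, {
    Name     = "${local.name_prefix}-sqs-dlq-messages"
    Severity = "Critical"
    Service  = "SQS"
  })
}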