Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following "Golden Four Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
48 lines
1.6 KiB
Plaintext
48 lines
1.6 KiB
Plaintext
# Example variable values for the Meteor platform infrastructure.
# The defaults below describe a disposable "dev" environment; a commented
# production example is at the end of this file.
# NOTE(review): presumably copied to a real variables file before use -
# confirm the expected file name against the project docs.

# AWS Configuration
aws_region = "us-east-1"

# Environment Configuration
environment  = "dev"
project_name = "meteor"

# S3 Configuration
s3_bucket_versioning    = true
s3_bucket_force_destroy = true # Set to false for production

# SQS Configuration
sqs_visibility_timeout_seconds = 300     # 5 minutes
sqs_message_retention_seconds  = 1209600 # 14 days
sqs_max_receive_count          = 3
# NOTE(review): sqs_max_receive_count is presumably the redrive (dead-letter)
# receive limit - confirm against the SQS module.

# RDS Configuration (set enable_rds = true to create RDS instance)
# The rds_* values below are presumably ignored while enable_rds = false - TODO confirm.
enable_rds                = false
rds_instance_class        = "db.t3.micro"
rds_allocated_storage     = 20  # presumably GiB - TODO confirm
rds_max_allocated_storage = 100 # presumably GiB (storage autoscaling ceiling) - TODO confirm

# ECS/Fargate Configuration (set enable_fargate = true to create VPC and ECS resources)
# NOTE(review): cpu/memory look like Fargate CPU units and MiB - confirm against the ECS module.
enable_fargate         = false
web_backend_cpu        = 256
web_backend_memory     = 512
compute_service_cpu    = 256
compute_service_memory = 512

# Monitoring Configuration
cloudwatch_log_retention_days = 14
enable_detailed_monitoring    = true

# Alerting Configuration
# Replace the placeholder address before applying; the email notification
# has to be confirmed manually by the recipient.
alert_email                       = "your-email@example.com" # REQUIRED: Email address to receive alerts
nestjs_error_rate_threshold       = 1.0                      # Percentage (1% = 1.0)
go_service_failure_rate_threshold = 5.0                      # Percentage (5% = 5.0)
sqs_queue_depth_threshold         = 1000                     # Number of visible messages
alarm_evaluation_periods          = 1                        # Number of periods to evaluate
alarm_period_seconds              = 300                      # 5 minutes

# Example for production:
# environment = "prod"
# s3_bucket_force_destroy = false
# enable_rds = true
# rds_instance_class = "db.t3.small"
# enable_fargate = true
# cloudwatch_log_retention_days = 30