Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3: **Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)** - Instrumented NestJS web backend with CloudWatch metrics integration using prom-client - Instrumented Go compute service with structured CloudWatch metrics reporting - Created comprehensive Terraform infrastructure from scratch with modular design - Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics - Added proper error categorization and provider performance tracking **Story 3.6: 关键故障告警 (Critical System Alerts)** - Implemented SNS-based alerting infrastructure via Terraform - Created critical alarms for NestJS 5xx error rate (>1% threshold) - Created Go service processing failure rate alarm (>5% threshold) - Created SQS queue depth alarm (>1000 messages threshold) - Added actionable alarm descriptions with investigation guidance - Configured email notifications with manual confirmation workflow **Cross-cutting Infrastructure:** - Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate) - Structured logging implementation across all services (NestJS, Go, Rust) - Metrics collection following "Golden Four Signals" observability approach - Configurable thresholds and deployment-ready monitoring solution The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
155 lines
3.6 KiB
HCL
155 lines
3.6 KiB
HCL
# Target AWS region for the resources created by this configuration.
# Falls back to us-east-1 when the caller does not override it.
variable "aws_region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region where resources will be created"
}
|
|
|
|
# Deployment environment label (dev, staging, prod, ...); "dev" unless overridden.
variable "environment" {
  type        = string
  default     = "dev"
  description = "Environment name (e.g., dev, staging, prod)"
}
|
|
|
|
# Project identifier; "meteor" unless overridden.
# NOTE(review): presumably used as a prefix in resource names — confirm where it is referenced.
variable "project_name" {
  type        = string
  default     = "meteor"
  description = "Name of the project"
}
|
|
|
|
# S3 Configuration
|
|
# Toggle for S3 bucket versioning; enabled by default.
variable "s3_bucket_versioning" {
  type        = bool
  default     = true
  description = "Enable S3 bucket versioning"
}
|
|
|
|
# Whether the S3 bucket may be destroyed while it still holds objects.
# Disabled by default as a guard against accidental data loss.
variable "s3_bucket_force_destroy" {
  type        = bool
  default     = false
  description = "Allow S3 bucket to be destroyed even if it contains objects"
}
|
|
|
|
# SQS Configuration
|
|
# Visibility timeout for the SQS queue, in seconds (default 300 = 5 minutes).
# SQS accepts only whole seconds from 0 to 43200 (12 hours); the validation
# reports an out-of-range value at plan time instead of failing during apply.
# NOTE(review): presumably wired to aws_sqs_queue.visibility_timeout_seconds — confirm in the queue definition.
variable "sqs_visibility_timeout_seconds" {
  description = "SQS visibility timeout in seconds"
  type        = number
  default     = 300

  validation {
    condition = (
      var.sqs_visibility_timeout_seconds >= 0 &&
      var.sqs_visibility_timeout_seconds <= 43200 &&
      floor(var.sqs_visibility_timeout_seconds) == var.sqs_visibility_timeout_seconds
    )
    error_message = "The sqs_visibility_timeout_seconds value must be a whole number between 0 and 43200 (12 hours)."
  }
}
|
|
|
|
# How long SQS keeps an unconsumed message, in seconds.
# The default (1209600 = 14 days) is the maximum SQS allows; the minimum is
# 60 seconds. The validation rejects anything outside that range at plan time.
variable "sqs_message_retention_seconds" {
  description = "SQS message retention period in seconds"
  type        = number
  default     = 1209600 # 14 days

  validation {
    condition = (
      var.sqs_message_retention_seconds >= 60 &&
      var.sqs_message_retention_seconds <= 1209600 &&
      floor(var.sqs_message_retention_seconds) == var.sqs_message_retention_seconds
    )
    error_message = "The sqs_message_retention_seconds value must be a whole number between 60 (1 minute) and 1209600 (14 days)."
  }
}
|
|
|
|
# Number of times a message may be received before it is moved to the
# dead-letter queue (default 3).
# SQS redrive policies accept a maxReceiveCount from 1 to 1000; the validation
# catches other values before apply.
# NOTE(review): presumably feeds the queue's redrive_policy — confirm in the queue definition.
variable "sqs_max_receive_count" {
  description = "Maximum number of receives before message goes to DLQ"
  type        = number
  default     = 3

  validation {
    condition = (
      var.sqs_max_receive_count >= 1 &&
      var.sqs_max_receive_count <= 1000 &&
      floor(var.sqs_max_receive_count) == var.sqs_max_receive_count
    )
    error_message = "The sqs_max_receive_count value must be a whole number between 1 and 1000."
  }
}
|
|
|
|
# RDS Configuration (if using RDS instead of external PostgreSQL)
|
|
# Feature flag for the RDS PostgreSQL instance; off by default.
variable "enable_rds" {
  type        = bool
  default     = false
  description = "Enable RDS PostgreSQL instance"
}
|
|
|
|
# Instance class for the optional RDS database (default db.t3.micro).
# Every RDS instance class name starts with "db."; the validation catches a
# plain EC2-style type such as "t3.micro" at plan time.
variable "rds_instance_class" {
  description = "RDS instance class"
  type        = string
  default     = "db.t3.micro"

  validation {
    condition     = can(regex("^db\\.", var.rds_instance_class))
    error_message = "The rds_instance_class value must be an RDS instance class starting with \"db.\" (for example db.t3.micro)."
  }
}
|
|
|
|
# Storage allocated to the RDS instance, in GB; 20 unless overridden.
variable "rds_allocated_storage" {
  type        = number
  default     = 20
  description = "RDS allocated storage in GB"
}
|
|
|
|
# Upper bound on RDS allocated storage, in GB; 100 unless overridden.
# NOTE(review): presumably the storage-autoscaling ceiling — confirm against the RDS resource.
variable "rds_max_allocated_storage" {
  type        = number
  default     = 100
  description = "RDS maximum allocated storage in GB"
}
|
|
|
|
# ECS/Fargate Configuration
|
|
# Feature flag for the ECS Fargate deployment; off by default.
variable "enable_fargate" {
  type        = bool
  default     = false
  description = "Enable ECS Fargate deployment"
}
|
|
|
|
# CPU units for the web backend service (default 256 = 0.25 vCPU).
# Fargate supports only a fixed set of task CPU sizes; the validation rejects
# any other value at plan time rather than at task registration.
# NOTE(review): assumes this value is used as a Fargate task-level CPU size —
# confirm against the task definition before relying on the validation.
variable "web_backend_cpu" {
  description = "CPU units for web backend service"
  type        = number
  default     = 256

  validation {
    condition     = contains([256, 512, 1024, 2048, 4096, 8192, 16384], var.web_backend_cpu)
    error_message = "The web_backend_cpu value must be a valid Fargate CPU size: 256, 512, 1024, 2048, 4096, 8192, or 16384."
  }
}
|
|
|
|
# Memory for the web backend service, in MB; 512 unless overridden.
# NOTE(review): if this feeds a Fargate task definition, the allowed values
# depend on the paired CPU setting — verify when changing either one.
variable "web_backend_memory" {
  type        = number
  default     = 512
  description = "Memory MB for web backend service"
}
|
|
|
|
# CPU units for the compute service (default 256 = 0.25 vCPU).
# Fargate supports only a fixed set of task CPU sizes; the validation rejects
# any other value at plan time rather than at task registration.
# NOTE(review): assumes this value is used as a Fargate task-level CPU size —
# confirm against the task definition before relying on the validation.
variable "compute_service_cpu" {
  description = "CPU units for compute service"
  type        = number
  default     = 256

  validation {
    condition     = contains([256, 512, 1024, 2048, 4096, 8192, 16384], var.compute_service_cpu)
    error_message = "The compute_service_cpu value must be a valid Fargate CPU size: 256, 512, 1024, 2048, 4096, 8192, or 16384."
  }
}
|
|
|
|
# Memory for the compute service, in MB; 512 unless overridden.
# NOTE(review): if this feeds a Fargate task definition, the allowed values
# depend on the paired CPU setting — verify when changing either one.
variable "compute_service_memory" {
  type        = number
  default     = 512
  description = "Memory MB for compute service"
}
|
|
|
|
# Monitoring Configuration
|
|
# Retention period for CloudWatch log groups, in days (default 14).
# CloudWatch Logs accepts only a fixed list of retention values; the Terraform
# AWS provider additionally treats 0 as "never expire". The validation rejects
# unsupported values (for example 10 or 45) at plan time.
variable "cloudwatch_log_retention_days" {
  description = "CloudWatch log retention period in days"
  type        = number
  default     = 14

  validation {
    condition = contains(
      [0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1096, 1827, 2192, 2557, 2922, 3288, 3653],
      var.cloudwatch_log_retention_days
    )
    error_message = "The cloudwatch_log_retention_days value must be one of: 0 (never expire), 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1096, 1827, 2192, 2557, 2922, 3288, or 3653."
  }
}
|
|
|
|
# Toggle for detailed CloudWatch monitoring; on by default.
variable "enable_detailed_monitoring" {
  type        = bool
  default     = true
  description = "Enable detailed CloudWatch monitoring"
}
|
|
|
|
# Alerting Configuration
|
|
# Address that receives alert notifications. Defaults to an empty string.
# NOTE(review): the empty default presumably means "no email subscription" —
# confirm how the SNS subscription handles it.
# A non-empty value must look like an email address (one "@", no whitespace),
# so a typo is reported at plan time instead of producing a subscription that
# fails or can never be confirmed.
variable "alert_email" {
  description = "Email address to receive alert notifications"
  type        = string
  default     = ""

  validation {
    condition     = var.alert_email == "" || can(regex("^[^@\\s]+@[^@\\s]+$", var.alert_email))
    error_message = "The alert_email value must be empty or a valid email address such as ops@example.com."
  }
}
|
|
|
|
# NestJS 5xx error-rate level, as a percentage, that triggers the alarm
# (default 1.0, i.e. 1%).
# The value is a percentage, so it must lie between 0 and 100; a threshold
# above 100 could never be crossed and would silently disable the alarm.
variable "nestjs_error_rate_threshold" {
  description = "NestJS 5xx error rate threshold (percentage) that triggers alarm"
  type        = number
  default     = 1.0

  validation {
    condition     = var.nestjs_error_rate_threshold >= 0 && var.nestjs_error_rate_threshold <= 100
    error_message = "The nestjs_error_rate_threshold value is a percentage and must be between 0 and 100."
  }
}
|
|
|
|
# Go service processing failure-rate level, as a percentage, that triggers
# the alarm (default 5.0, i.e. 5%).
# The value is a percentage, so it must lie between 0 and 100; a threshold
# above 100 could never be crossed and would silently disable the alarm.
variable "go_service_failure_rate_threshold" {
  description = "Go service processing failure rate threshold (percentage) that triggers alarm"
  type        = number
  default     = 5.0

  validation {
    condition     = var.go_service_failure_rate_threshold >= 0 && var.go_service_failure_rate_threshold <= 100
    error_message = "The go_service_failure_rate_threshold value is a percentage and must be between 0 and 100."
  }
}
|
|
|
|
# Count of visible SQS messages that triggers the queue-depth alarm;
# 1000 unless overridden.
variable "sqs_queue_depth_threshold" {
  type        = number
  default     = 1000
  description = "SQS queue depth threshold (number of visible messages) that triggers alarm"
}
|
|
|
|
# Number of periods evaluated when deciding the alarm state (default 1).
# CloudWatch requires a whole number of at least 1; the validation rejects
# zero, negative, and fractional values at plan time.
variable "alarm_evaluation_periods" {
  description = "Number of periods to evaluate for alarm state"
  type        = number
  default     = 1

  validation {
    condition = (
      var.alarm_evaluation_periods >= 1 &&
      floor(var.alarm_evaluation_periods) == var.alarm_evaluation_periods
    )
    error_message = "The alarm_evaluation_periods value must be a whole number greater than or equal to 1."
  }
}
|
|
|
|
# Length of one alarm evaluation period, in seconds (default 300 = 5 minutes).
# CloudWatch alarms accept only 10, 20, 30, or a multiple of 60 as the period;
# the validation reports any other value at plan time.
# NOTE(review): sub-minute periods only make sense for high-resolution metrics —
# verify the metric resolution before going below 60.
variable "alarm_period_seconds" {
  description = "Period in seconds for alarm evaluation"
  type        = number
  default     = 300

  validation {
    condition = (
      var.alarm_period_seconds > 0 &&
      (contains([10, 20, 30], var.alarm_period_seconds) || var.alarm_period_seconds % 60 == 0)
    )
    error_message = "The alarm_period_seconds value must be 10, 20, 30, or a positive multiple of 60."
  }
}