grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00

128 lines
3.2 KiB
Go

package config
import (
"os"
"strconv"
"time"
)
// Config holds the application configuration.
//
// Each field is filled in by Load from the environment variable named in
// its comment.
type Config struct {
	// Port is read from PORT.
	Port string

	// Database configuration.
	DatabaseURL      string        // DATABASE_URL
	DatabaseMaxConns int32         // DATABASE_MAX_CONNS
	DatabaseTimeout  time.Duration // DATABASE_TIMEOUT

	// SQS configuration.
	// Queue URL and region: SQS_QUEUE_URL, SQS_REGION.
	SQSQueueURL, SQSRegion string
	// Receive settings: SQS_MAX_MESSAGES, SQS_WAIT_TIME_SECONDS,
	// SQS_VISIBILITY_TIMEOUT.
	SQSMaxMessages, SQSWaitTimeSeconds, SQSVisibilityTimeout int32

	// Processing configuration.
	// Worker count and batch size: PROCESSING_WORKERS, PROCESSING_BATCH_SIZE.
	ProcessingWorkers, ProcessingBatchSize int
	// IdempotencyEnabled is read from IDEMPOTENCY_ENABLED.
	IdempotencyEnabled bool

	// Validation configuration.
	// ValidationProvider is read from VALIDATION_PROVIDER.
	ValidationProvider string
}
// Load builds a Config from environment variables.
//
// A string setting that is unset or empty takes its built-in default.
// Numeric, boolean and duration settings go through the parse helpers,
// which also fall back to the default when the value cannot be parsed.
func Load() *Config {
	// envOr returns the value of the environment variable key, or fallback
	// when the variable is unset or empty.
	envOr := func(key, fallback string) string {
		if v := os.Getenv(key); v != "" {
			return v
		}
		return fallback
	}

	return &Config{
		Port: envOr("PORT", "8080"),

		DatabaseURL:      envOr("DATABASE_URL", "postgres://postgres:password@localhost:5432/meteor_development?sslmode=disable"),
		DatabaseMaxConns: parseInt32(os.Getenv("DATABASE_MAX_CONNS"), 10),
		DatabaseTimeout:  parseDuration(os.Getenv("DATABASE_TIMEOUT"), 30*time.Second),

		SQSQueueURL:          envOr("SQS_QUEUE_URL", "https://sqs.us-east-1.amazonaws.com/123456789012/meteor-raw-events-queue"),
		SQSRegion:            envOr("SQS_REGION", "us-east-1"),
		SQSMaxMessages:       parseInt32(os.Getenv("SQS_MAX_MESSAGES"), 10),
		SQSWaitTimeSeconds:   parseInt32(os.Getenv("SQS_WAIT_TIME_SECONDS"), 20),
		SQSVisibilityTimeout: parseInt32(os.Getenv("SQS_VISIBILITY_TIMEOUT"), 300),

		ProcessingWorkers:   parseInt(os.Getenv("PROCESSING_WORKERS"), 5),
		ProcessingBatchSize: parseInt(os.Getenv("PROCESSING_BATCH_SIZE"), 10),
		IdempotencyEnabled:  parseBool(os.Getenv("IDEMPOTENCY_ENABLED"), true),

		// Default to MVP provider for backward compatibility.
		ValidationProvider: envOr("VALIDATION_PROVIDER", "mvp"),
	}
}
// Helper functions for parsing environment variables.

// parseInt converts s to an int, returning defaultValue when s is empty or
// is not a valid base-10 integer that fits in an int.
func parseInt(s string, defaultValue int) int {
	parsed, err := strconv.Atoi(s)
	if err != nil {
		// This also covers the empty string, which Atoi rejects.
		return defaultValue
	}
	return parsed
}
// parseInt32 converts s to an int32, returning defaultValue when s is empty,
// is not a valid base-10 integer, or does not fit in 32 bits.
func parseInt32(s string, defaultValue int32) int32 {
	result := defaultValue
	// ParseInt reports an error for the empty string, so no separate check
	// is needed before calling it.
	if parsed, err := strconv.ParseInt(s, 10, 32); err == nil {
		result = int32(parsed)
	}
	return result
}
// parseBool converts s to a bool using strconv.ParseBool, returning
// defaultValue when s is empty or is not a value ParseBool accepts.
func parseBool(s string, defaultValue bool) bool {
	parsed, err := strconv.ParseBool(s)
	if s == "" || err != nil {
		return defaultValue
	}
	return parsed
}
// parseDuration converts s to a time.Duration using time.ParseDuration
// (e.g. "30s", "1.5h"), returning defaultValue when s is empty or is not a
// valid duration string.
func parseDuration(s string, defaultValue time.Duration) time.Duration {
	if s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			return d
		}
	}
	return defaultValue
}