grabbit ca7e92a1a1 🎉 Epic 3 Complete: Production Readiness & Observability
Successfully implemented comprehensive monitoring and alerting infrastructure for the Meteor platform across all three stories of Epic 3:

**Story 3.5: 核心业务指标监控 (Core Business Metrics Monitoring)**
- Instrumented NestJS web backend with CloudWatch metrics integration using prom-client
- Instrumented Go compute service with structured CloudWatch metrics reporting
- Created comprehensive Terraform infrastructure from scratch with modular design
- Built 5-row CloudWatch dashboard with application, error rate, business, and infrastructure metrics
- Added proper error categorization and provider performance tracking

**Story 3.6: 关键故障告警 (Critical System Alerts)**
- Implemented SNS-based alerting infrastructure via Terraform
- Created critical alarms for NestJS 5xx error rate (>1% threshold)
- Created Go service processing failure rate alarm (>5% threshold)
- Created SQS queue depth alarm (>1000 messages threshold)
- Added actionable alarm descriptions with investigation guidance
- Configured email notifications with manual confirmation workflow

**Cross-cutting Infrastructure:**
- Complete AWS infrastructure as code with Terraform (S3, SQS, CloudWatch, SNS, IAM, optional RDS/Fargate)
- Structured logging implementation across all services (NestJS, Go, Rust)
- Metrics collection following the "Four Golden Signals" observability approach
- Configurable thresholds and deployment-ready monitoring solution

The platform now has production-grade observability with comprehensive metrics collection, centralized monitoring dashboards, and automated critical system alerting.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-03 23:42:01 +08:00

255 lines
7.7 KiB
Go

package logger
import (
"context"
"os"
"time"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
// ContextKey is the dedicated string type used for this package's context
// keys. A named key type keeps values stored by this package from colliding
// with context values stored under plain strings by other packages.
type ContextKey string
const (
// CorrelationIDKey is the context key under which WithCorrelationID stores
// the correlation ID and from which GetCorrelationID reads it back.
CorrelationIDKey ContextKey = "correlation_id"
)
// StructuredLogger provides standardized logging for the meteor compute
// service. It wraps a zerolog.Logger and remembers the service name and
// version it was created with.
type StructuredLogger struct {
// logger is the underlying zerolog logger that events are written through.
logger zerolog.Logger
// service is the service name passed to NewStructuredLogger.
service string
// version is the service version passed to NewStructuredLogger.
version string
}
// LogEntry represents a standardized log entry and the JSON keys it is
// serialized under.
// NOTE(review): nothing in the visible part of this file constructs a
// LogEntry; confirm whether it is still used elsewhere.
type LogEntry struct {
Timestamp string `json:"timestamp"`
Level string `json:"level"`
ServiceName string `json:"service_name"`
// CorrelationID is a pointer, so a missing ID is encoded as JSON null.
CorrelationID *string `json:"correlation_id"`
Message string `json:"message"`
// NOTE(review): encoding/json has no "inline" tag option, so with the
// standard encoder this field is emitted under the key "Extra" rather than
// being flattened into the entry. Confirm which encoder is intended.
Extra interface{} `json:",inline"`
}
// Field is a single key-value pair attached to a log entry. The logging
// methods in this file write each Field with zerolog's Event.Interface, so
// Value may hold any type.
type Field struct {
// Key is the name the value is logged under.
Key string
// Value is the value to log.
Value interface{}
}
// NewStructuredLogger builds a StructuredLogger for the given service name and
// version.
//
// As a side effect it configures zerolog process-wide:
//   - when NODE_ENV is "development", the global log.Logger is replaced with a
//     console writer on os.Stdout; otherwise the existing output is kept and
//     zerolog.TimeFieldFormat is set to RFC 3339;
//   - the global level is taken from LOG_LEVEL ("debug", "info", "warn" or
//     "error"); any other value, including an unset variable, selects info.
//
// The returned logger is derived from the global log.Logger and carries
// "service_name" and "version" in its context.
func NewStructuredLogger(service, version string) *StructuredLogger {
	if os.Getenv("NODE_ENV") != "development" {
		// Production: JSON output with RFC 3339 timestamps.
		zerolog.TimeFieldFormat = time.RFC3339
	} else {
		// Development: human-readable console output.
		console := zerolog.ConsoleWriter{
			Out:        os.Stdout,
			TimeFormat: time.RFC3339,
			NoColor:    false,
		}
		log.Logger = log.Output(console)
	}

	// Resolve the level first (info unless LOG_LEVEL says otherwise), then
	// apply it exactly once.
	level := zerolog.InfoLevel
	switch os.Getenv("LOG_LEVEL") {
	case "debug":
		level = zerolog.DebugLevel
	case "warn":
		level = zerolog.WarnLevel
	case "error":
		level = zerolog.ErrorLevel
	}
	zerolog.SetGlobalLevel(level)

	base := log.With().
		Str("service_name", service).
		Str("version", version).
		Logger()

	return &StructuredLogger{logger: base, service: service, version: version}
}
// WithCorrelationID returns a child of ctx that carries correlationID under
// CorrelationIDKey. The parent context is not modified.
func WithCorrelationID(ctx context.Context, correlationID string) context.Context {
	enriched := context.WithValue(ctx, CorrelationIDKey, correlationID)
	return enriched
}
// GetCorrelationID returns a pointer to the correlation ID stored in ctx under
// CorrelationIDKey.
//
// It returns nil when ctx is nil, when no value is stored, when the stored
// value is not a string, or when it is the empty string. The nil-context guard
// keeps a logging call from panicking if a caller has no context to pass.
func GetCorrelationID(ctx context.Context) *string {
	if ctx == nil {
		return nil
	}
	correlationID, ok := ctx.Value(CorrelationIDKey).(string)
	if !ok || correlationID == "" {
		return nil
	}
	return &correlationID
}
// createLogEvent starts a zerolog event at the given level and attaches the
// correlation ID found in ctx, if there is one.
//
// The timestamp and "service_name" are deliberately not added here. l.logger
// is built in NewStructuredLogger from zerolog's global log.Logger, which is
// created with a timestamp in its context, and NewStructuredLogger adds
// "service_name" to that context as well. zerolog does not de-duplicate keys,
// so adding them again per event produced JSON lines with repeated "time" and
// "service_name" members.
// NOTE(review): if the application replaces the global log.Logger with one
// that has no timestamp before NewStructuredLogger runs, add the timestamp
// where that logger is configured rather than here.
func (l *StructuredLogger) createLogEvent(level zerolog.Level, ctx context.Context) *zerolog.Event {
	event := l.logger.WithLevel(level)
	if correlationID := GetCorrelationID(ctx); correlationID != nil {
		event = event.Str("correlation_id", *correlationID)
	}
	return event
}
// Info writes message at info level, attaching the correlation ID from ctx
// (when present) and every supplied field.
func (l *StructuredLogger) Info(ctx context.Context, message string, fields ...Field) {
	entry := l.createLogEvent(zerolog.InfoLevel, ctx)
	for i := range fields {
		entry = entry.Interface(fields[i].Key, fields[i].Value)
	}
	entry.Msg(message)
}
// Warn writes message at warn level, attaching the correlation ID from ctx
// (when present) and every supplied field.
func (l *StructuredLogger) Warn(ctx context.Context, message string, fields ...Field) {
	entry := l.createLogEvent(zerolog.WarnLevel, ctx)
	for i := range fields {
		entry = entry.Interface(fields[i].Key, fields[i].Value)
	}
	entry.Msg(message)
}
// Error writes message at error level. A non-nil err is recorded through
// zerolog's Err; the supplied fields are attached after it.
func (l *StructuredLogger) Error(ctx context.Context, message string, err error, fields ...Field) {
	entry := l.createLogEvent(zerolog.ErrorLevel, ctx)
	if err != nil {
		entry = entry.Err(err)
	}
	for i := range fields {
		entry = entry.Interface(fields[i].Key, fields[i].Value)
	}
	entry.Msg(message)
}
// Debug writes message at debug level, attaching the correlation ID from ctx
// (when present) and every supplied field.
func (l *StructuredLogger) Debug(ctx context.Context, message string, fields ...Field) {
	entry := l.createLogEvent(zerolog.DebugLevel, ctx)
	for i := range fields {
		entry = entry.Interface(fields[i].Key, fields[i].Value)
	}
	entry.Msg(message)
}
// Business-specific logging methods
// ProcessingEvent logs, at info level, that the event identified by eventID
// reached the given processing stage. Caller-supplied fields are written
// first, followed by "event_id" and "processing_stage".
func (l *StructuredLogger) ProcessingEvent(ctx context.Context, eventID, stage string, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "event_id", Value: eventID},
		Field{Key: "processing_stage", Value: stage},
	)
	l.Info(ctx, "Event processing stage", allFields...)
}
// ValidationEvent logs, at info level, the outcome of validating the event
// identified by eventID. Caller-supplied fields are written first, followed by
// "event_id", "validation_algorithm", "is_valid" and "validation_score".
func (l *StructuredLogger) ValidationEvent(ctx context.Context, eventID, algorithm string, isValid bool, score float64, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+4)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "event_id", Value: eventID},
		Field{Key: "validation_algorithm", Value: algorithm},
		Field{Key: "is_valid", Value: isValid},
		Field{Key: "validation_score", Value: score},
	)
	l.Info(ctx, "Event validation completed", allFields...)
}
// DatabaseEvent logs, at debug level, a completed database operation and how
// long it took. Caller-supplied fields are written first, followed by
// "database_operation" and "duration_ms" (duration in whole milliseconds).
func (l *StructuredLogger) DatabaseEvent(ctx context.Context, operation string, duration time.Duration, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "database_operation", Value: operation},
		Field{Key: "duration_ms", Value: duration.Milliseconds()},
	)
	l.Debug(ctx, "Database operation completed", allFields...)
}
// SQSEvent logs, at info level, an SQS operation on the message identified by
// messageID. Caller-supplied fields are written first, followed by
// "sqs_operation" and "sqs_message_id".
func (l *StructuredLogger) SQSEvent(ctx context.Context, operation, messageID string, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "sqs_operation", Value: operation},
		Field{Key: "sqs_message_id", Value: messageID},
	)
	l.Info(ctx, "SQS operation", allFields...)
}
// StartupEvent logs, at info level, that a component has been initialized.
// Caller-supplied fields are written first, followed by "startup_component".
func (l *StructuredLogger) StartupEvent(ctx context.Context, component string, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+1)
	allFields = append(allFields, fields...)
	allFields = append(allFields, Field{Key: "startup_component", Value: component})
	l.Info(ctx, "Component initialized", allFields...)
}
// HealthEvent logs the result of a health check for component. A passing check
// is logged at debug level and a failing one at warn level. Caller-supplied
// fields are written first, followed by "health_component" and "healthy".
func (l *StructuredLogger) HealthEvent(ctx context.Context, component string, healthy bool, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "health_component", Value: component},
		Field{Key: "healthy", Value: healthy},
	)
	if healthy {
		l.Debug(ctx, "Health check passed", allFields...)
		return
	}
	l.Warn(ctx, "Health check failed", allFields...)
}
// SecurityEvent logs a security-related event at warn level. Caller-supplied
// fields are written first, followed by "security_event".
func (l *StructuredLogger) SecurityEvent(ctx context.Context, event string, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+1)
	allFields = append(allFields, fields...)
	allFields = append(allFields, Field{Key: "security_event", Value: event})
	l.Warn(ctx, "Security event detected", allFields...)
}
// PerformanceEvent logs, at info level, how long an operation took.
// Caller-supplied fields are written first, followed by
// "performance_operation" and "duration_ms" (duration in whole milliseconds).
func (l *StructuredLogger) PerformanceEvent(ctx context.Context, operation string, duration time.Duration, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "performance_operation", Value: operation},
		Field{Key: "duration_ms", Value: duration.Milliseconds()},
	)
	l.Info(ctx, "Performance metric", allFields...)
}
// MetricsEvent logs a named metric and its value at info level.
// Caller-supplied fields are written first, followed by "metric_name" and
// "metric_value".
func (l *StructuredLogger) MetricsEvent(ctx context.Context, metric string, value interface{}, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "metric_name", Value: metric},
		Field{Key: "metric_value", Value: value},
	)
	l.Info(ctx, "Metrics data", allFields...)
}
// WorkerEvent logs, at info level, an event raised by the worker identified by
// workerID. Caller-supplied fields are written first, followed by "worker_id"
// and "worker_event".
func (l *StructuredLogger) WorkerEvent(ctx context.Context, workerID int, event string, fields ...Field) {
	// Build a fresh slice: appending to the variadic slice directly can write
	// into the caller's backing array when it has spare capacity.
	allFields := make([]Field, 0, len(fields)+2)
	allFields = append(allFields, fields...)
	allFields = append(allFields,
		Field{Key: "worker_id", Value: workerID},
		Field{Key: "worker_event", Value: event},
	)
	l.Info(ctx, "Worker event", allFields...)
}
// NewField pairs key with value and returns them as a Field, ready to be
// passed to any of the logging methods.
func NewField(key string, value interface{}) Field {
	var f Field
	f.Key = key
	f.Value = value
	return f
}
// GetZerologLogger exposes the wrapped zerolog.Logger for integrations that
// need to talk to zerolog directly. The logger is returned by value.
func (l *StructuredLogger) GetZerologLogger() zerolog.Logger {
	underlying := l.logger
	return underlying
}